
add multigraph debugger support, update condition, search category, tensor summary

tags/v1.1.0
maning202007 5 years ago
commit f516158501
96 changed files with 10803 additions and 1467 deletions
1. +26 -0 mindinsight/backend/conditionmgr/__init__.py
2. +46 -0 mindinsight/backend/conditionmgr/conditionmgr_api.py
3. +76 -14 mindinsight/backend/debugger/debugger_api.py
4. +15 -0 mindinsight/conditionmgr/__init__.py
5. +15 -0 mindinsight/conditionmgr/common/__init__.py
6. +18 -0 mindinsight/conditionmgr/common/utils.py
7. +232 -0 mindinsight/conditionmgr/condition.py
8. +599 -0 mindinsight/conditionmgr/condition_list.py
9. +132 -0 mindinsight/conditionmgr/conditionmgr.py
10. +19 -0 mindinsight/conditionmgr/log.py
11. +365 -0 mindinsight/conditionmgr/recommender.py
12. +2 -0 mindinsight/conf/constants.py
13. +2 -2 mindinsight/datavisual/data_transform/graph/msgraph.py
14. +3 -1 mindinsight/datavisual/data_transform/graph/node.py
15. +1 -1 mindinsight/datavisual/data_transform/graph/node_tree.py
16. +2 -0 mindinsight/debugger/common/exceptions/error_code.py
17. +11 -0 mindinsight/debugger/common/exceptions/exceptions.py
18. +1 -1 mindinsight/debugger/common/log.py
19. +7 -4 mindinsight/debugger/common/utils.py
20. +1 -1 mindinsight/debugger/debugger_cache.py
21. +92 -23 mindinsight/debugger/debugger_grpc_server.py
22. +299 -118 mindinsight/debugger/debugger_server.py
23. +26 -5 mindinsight/debugger/proto/debug_grpc.proto
24. +156 -25 mindinsight/debugger/proto/debug_grpc_pb2.py
25. +52 -13 mindinsight/debugger/proto/debug_grpc_pb2_grpc.py
26. +177 -7 mindinsight/debugger/stream_cache/debugger_graph.py
27. +81 -0 mindinsight/debugger/stream_cache/debugger_multigraph.py
28. +143 -0 mindinsight/debugger/stream_cache/node_type_identifier.py
29. +13 -1 mindinsight/debugger/stream_cache/tensor.py
30. +77 -45 mindinsight/debugger/stream_cache/watchpoint.py
31. +1 -1 mindinsight/debugger/stream_handler/event_handler.py
32. +472 -104 mindinsight/debugger/stream_handler/graph_handler.py
33. +37 -5 mindinsight/debugger/stream_handler/metadata_handler.py
34. +59 -14 mindinsight/debugger/stream_handler/tensor_handler.py
35. +368 -93 mindinsight/debugger/stream_handler/watchpoint_handler.py
36. +15 -0 mindinsight/debugger/stream_operator/__init__.py
37. +120 -0 mindinsight/debugger/stream_operator/tensor_detail_info.py
38. +61 -6 mindinsight/utils/tensor.py
39. +8 -2 tests/st/func/debugger/conftest.py
40. +1 -1 tests/st/func/debugger/expect_results/restful_results/before_train_begin.json
41. +75 -56 tests/st/func/debugger/expect_results/restful_results/compare_tensors.json
42. +1 -1 tests/st/func/debugger/expect_results/restful_results/create_and_delete_watchpoint.json
43. +1 -0 tests/st/func/debugger/expect_results/restful_results/get_conditions_for_ascend.json
44. +1 -0 tests/st/func/debugger/expect_results/restful_results/get_conditions_for_gpu.json
45. +1 -0 tests/st/func/debugger/expect_results/restful_results/multi_next_node.json
46. +672 -0 tests/st/func/debugger/expect_results/restful_results/multi_retrieve_aggregation_scope_node.json
47. +44 -0 tests/st/func/debugger/expect_results/restful_results/multi_retrieve_all.json
48. +534 -0 tests/st/func/debugger/expect_results/restful_results/multi_retrieve_scope_node.json
49. +1735 -0 tests/st/func/debugger/expect_results/restful_results/multi_retrieve_single_node.json
50. +1 -0 tests/st/func/debugger/expect_results/restful_results/multi_retrieve_watchpoint.json
51. +1 -0 tests/st/func/debugger/expect_results/restful_results/multi_run_steps.json
52. +1 -0 tests/st/func/debugger/expect_results/restful_results/recommended_watchpoints_at_startup.json
53. +1 -672 tests/st/func/debugger/expect_results/restful_results/retrieve_aggregation_scope_node.json
54. +6 -1 tests/st/func/debugger/expect_results/restful_results/retrieve_all.json
55. +5 -6 tests/st/func/debugger/expect_results/restful_results/retrieve_empty_tensor_history.json
56. +3 -4 tests/st/func/debugger/expect_results/restful_results/retrieve_full_tensor_history.json
57. +547 -1 tests/st/func/debugger/expect_results/restful_results/retrieve_next_node_on_gpu.json
58. +1737 -1 tests/st/func/debugger/expect_results/restful_results/retrieve_single_watchpoint_hit.json
59. +138 -0 tests/st/func/debugger/expect_results/restful_results/retrieve_tensor_graph-0.json
60. +72 -0 tests/st/func/debugger/expect_results/restful_results/retrieve_tensor_graph-1.json
61. +1 -0 tests/st/func/debugger/expect_results/restful_results/retrieve_tensor_hits-0.json
62. +1 -0 tests/st/func/debugger/expect_results/restful_results/retrieve_tensor_hits-1.json
63. +1 -1 tests/st/func/debugger/expect_results/restful_results/retrieve_tensor_value.json
64. +1 -1 tests/st/func/debugger/expect_results/restful_results/retrieve_watchpoint_hit.json
65. +1 -0 tests/st/func/debugger/expect_results/restful_results/search_activation.json
66. +1 -0 tests/st/func/debugger/expect_results/restful_results/search_activation_multi_graph.json
67. +1 -0 tests/st/func/debugger/expect_results/restful_results/search_gradient.json
68. +31 -1 tests/st/func/debugger/expect_results/restful_results/search_unwatched_leaf_node.json
69. +1 -0 tests/st/func/debugger/expect_results/restful_results/search_weight.json
70. +1 -0 tests/st/func/debugger/expect_results/restful_results/search_weight_multi_graph.json
71. +22 -7 tests/st/func/debugger/mock_ms_client.py
72. +419 -86 tests/st/func/debugger/test_restful_api.py
73. +11 -6 tests/st/func/debugger/utils.py
74. +12 -19 tests/ut/debugger/configurations.py
75. +1 -1 tests/ut/debugger/expected_results/debugger_server/retrieve_all.json
76. +36 -1 tests/ut/debugger/expected_results/debugger_server/retrieve_tensor_history.json
77. +197 -0 tests/ut/debugger/expected_results/graph/get_tensor_graph-0.json
78. +176 -0 tests/ut/debugger/expected_results/graph/get_tensor_graph-1.json
79. +166 -0 tests/ut/debugger/expected_results/graph/get_tensor_graph-2.json
80. +1 -1 tests/ut/debugger/expected_results/graph/graph_handler_get_1_no_filter_condintion.json
81. +1 -0 tests/ut/debugger/expected_results/graph/search_nodes_by_type_0.json
82. +1 -0 tests/ut/debugger/expected_results/graph/search_nodes_by_type_1.json
83. +0 -1 tests/ut/debugger/expected_results/graph/tenor_hist_0.json
84. +18 -0 tests/ut/debugger/expected_results/graph/tensor_hist_0.json
85. +53 -1 tests/ut/debugger/expected_results/graph/tensor_hist_1.json
86. +6 -14 tests/ut/debugger/expected_results/watchpoint/watchpoint_handler_get_0.json
87. +1 -1 tests/ut/debugger/expected_results/watchpoint/watchpoint_handler_get_1.json
88. +22 -1 tests/ut/debugger/expected_results/watchpoint/watchpoint_hit_handler_get_0.json
89. +15 -0 tests/ut/debugger/stream_cache/__init__.py
90. +77 -0 tests/ut/debugger/stream_cache/test_node_type_identifier.py
91. +27 -4 tests/ut/debugger/stream_handler/test_graph_handler.py
92. +1 -30 tests/ut/debugger/stream_handler/test_tensor_handler.py
93. +67 -43 tests/ut/debugger/stream_handler/test_watchpoint_handler.py
94. +8 -5 tests/ut/debugger/test_debugger_grpc_server.py
95. +17 -12 tests/ut/debugger/test_debugger_server.py
96. +1 -2 tests/utils/tools.py

+26 -0 mindinsight/backend/conditionmgr/__init__.py

@@ -0,0 +1,26 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Module init file."""
from mindinsight.backend.conditionmgr.conditionmgr_api import init_module as init_query_module


def init_module(app):
"""
Init module entry.

Args:
app (Flask): A Flask instance.
"""
init_query_module(app)

+46 -0 mindinsight/backend/conditionmgr/conditionmgr_api.py

@@ -0,0 +1,46 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Conditionmgr restful api."""
from flask import Blueprint

from mindinsight.conf import settings
from mindinsight.backend.debugger.debugger_api import BACKEND_SERVER, _wrap_reply

BLUEPRINT = Blueprint("conditionmgr", __name__,
url_prefix=settings.URL_PATH_PREFIX + settings.API_PREFIX)


@BLUEPRINT.route("/conditionmgr/train-jobs/<train_id>/conditions", methods=["GET"])
def get_conditions(train_id):
"""get conditions"""
reply = _wrap_reply(BACKEND_SERVER.get_conditions, train_id)
return reply


@BLUEPRINT.route("/conditionmgr/train-jobs/<train_id>/condition-collections", methods=["GET"])
def get_condition_collections(train_id):
"""get condition collections"""
reply = _wrap_reply(BACKEND_SERVER.get_condition_collections, train_id)
return reply


def init_module(app):
"""
Init module entry.

Args:
app (Flask): The application obj.
"""
app.register_blueprint(BLUEPRINT)
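
A minimal sketch of calling these new endpoints from a client; the host, port, and train job id below are placeholders, not values from this commit:

import requests
from urllib.parse import quote

# Placeholder deployment address and train job id (train ids must be URL-encoded).
BASE = "http://127.0.0.1:8080/v1/mindinsight"
TRAIN_ID = quote("./summary_dir", safe="")

# Conditions available for this train job.
print(requests.get(f"{BASE}/conditionmgr/train-jobs/{TRAIN_ID}/conditions").json())

# Condition collections grouped by target type (activation/gradient/tensor/weight).
print(requests.get(f"{BASE}/conditionmgr/train-jobs/{TRAIN_ID}/condition-collections").json())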

+76 -14 mindinsight/backend/debugger/debugger_api.py

@@ -88,11 +88,16 @@ def search():
str, the required data.

Examples:
>>> Get http://xxxx/v1/mindinsight/debugger/retrive?mode=all
>>> Get http://xxxx/v1/mindinsight/debugger/search?name=mock_name&watch_point_id=1
"""
name = request.args.get('name')
graph_name = request.args.get('graph_name')
watch_point_id = int(request.args.get('watch_point_id', 0))
reply = _wrap_reply(BACKEND_SERVER.search, name, watch_point_id)
node_category = request.args.get('node_category')
reply = _wrap_reply(BACKEND_SERVER.search, {'name': name,
'graph_name': graph_name,
'watch_point_id': watch_point_id,
'node_category': node_category})

return reply

@@ -109,9 +114,10 @@ def retrieve_node_by_bfs():
>>> Get http://xxxx/v1/mindinsight/debugger/retrieve_node_by_bfs?name=node_name&ascend=true
"""
name = request.args.get('name')
graph_name = request.args.get('graph_name')
ascend = request.args.get('ascend', 'false')
ascend = ascend == 'true'
reply = _wrap_reply(BACKEND_SERVER.retrieve_node_by_bfs, name, ascend)
reply = _wrap_reply(BACKEND_SERVER.retrieve_node_by_bfs, name, graph_name, ascend)

return reply

@@ -167,7 +173,8 @@ def retrieve_tensor_history():
"""
body = _read_post_request(request)
name = body.get('name')
reply = _wrap_reply(BACKEND_SERVER.retrieve_tensor_history, name)
graph_name = body.get('graph_name')
reply = _wrap_reply(BACKEND_SERVER.retrieve_tensor_history, name, graph_name)
return reply


@@ -180,12 +187,15 @@ def retrieve_tensor_value():
str, the required data.

Examples:
>>> GET http://xxxx/v1/mindinsight/debugger/tensors?name=node_name&detail=data&shape=[1,1,:,:]
>>> GET http://xxxx/v1/mindinsight/debugger/tensors?name=tensor_name&detail=data&shape=[1,1,:,:]
"""
name = request.args.get('name')
detail = request.args.get('detail')
shape = request.args.get('shape')
reply = _wrap_reply(BACKEND_SERVER.retrieve_tensor_value, name, detail, shape)
graph_name = request.args.get('graph_name')
prev = bool(request.args.get('prev') == 'true')

reply = _wrap_reply(BACKEND_SERVER.retrieve_tensor_value, name, detail, shape, graph_name, prev)
return reply


@@ -199,7 +209,6 @@ def create_watchpoint():

Raises:
MindInsightException: If method fails to be called.
ParamValueError: If parsing json data search_condition fails.

Examples:
>>> POST http://xxxx/v1/mindinsight/debugger/create_watchpoint
@@ -207,9 +216,12 @@ def create_watchpoint():
body = _read_post_request(request)

condition = body.get('condition')
graph_name = body.get('graph_name')
watch_nodes = body.get('watch_nodes')
watch_point_id = body.get('watch_point_id')
reply = _wrap_reply(BACKEND_SERVER.create_watchpoint, condition, watch_nodes, watch_point_id)
search_pattern = body.get('search_pattern')
reply = _wrap_reply(BACKEND_SERVER.create_watchpoint,
condition, watch_nodes, watch_point_id, search_pattern, graph_name)
return reply


@@ -223,7 +235,6 @@ def update_watchpoint():

Raises:
MindInsightException: If method fails to be called.
ParamValueError: If parsing json data search_condition fails.

Examples:
>>> POST http://xxxx/v1/mindinsight/debugger/update_watchpoint
@@ -232,10 +243,10 @@ def update_watchpoint():

watch_point_id = body.get('watch_point_id')
watch_nodes = body.get('watch_nodes')
graph_name = body.get('graph_name')
mode = body.get('mode')
name = body.get('name')
reply = _wrap_reply(BACKEND_SERVER.update_watchpoint, watch_point_id, watch_nodes, mode, name)

pattern = body.get('search_pattern')
reply = _wrap_reply(BACKEND_SERVER.update_watchpoint, watch_point_id, watch_nodes, mode, pattern, graph_name)
return reply


@@ -249,7 +260,6 @@ def delete_watchpoint():

Raises:
MindInsightException: If method fails to be called.
ParamValueError: If parsing json data search_condition fails.

Examples:
>>> POST http://xxxx/v1/mindinsight/debugger/delete_watchpoint
@@ -273,7 +283,6 @@ def control():

Raises:
MindInsightException: If method fails to be called.
ParamValueError: If parsing json data search_condition fails.

Examples:
>>> POST http://xxxx/v1/mindinsight/debugger/control
@@ -284,6 +293,59 @@ def control():
return reply


@BLUEPRINT.route("/debugger/recheck", methods=["POST"])
def recheck():
"""
Recheck request.

Returns:
str, reply message.

Raises:
MindInsightException: If method fails to be called.

Examples:
>>> POST http://xxxx/v1/mindinsight/debugger/recheck
"""
reply = _wrap_reply(BACKEND_SERVER.recheck)

return reply


@BLUEPRINT.route("/debugger/tensor_graphs", methods=["GET"])
def retrieve_tensor_graph():
"""
Retrieve the graph around a tensor according to tensor name and graph name.

Returns:
str, the required data.

Examples:
>>> GET http://xxxx/v1/mindinsight/debugger/tensor_graphs?tensor_name=tensor_name&graph_name=graph_name
"""
tensor_name = request.args.get('tensor_name')
graph_name = request.args.get('graph_name')
reply = _wrap_reply(BACKEND_SERVER.retrieve_tensor_graph, tensor_name, graph_name)
return reply


@BLUEPRINT.route("/debugger/tensor_hits", methods=["GET"])
def retrieve_tensor_hits():
"""
Retrieve tensor hits according to tensor name and graph name.

Returns:
str, the required data.

Examples:
>>> GET http://xxxx/v1/mindinsight/debugger/tensor_hits?tensor_name=tensor_name&graph_name=graph_name
"""
tensor_name = request.args.get('tensor_name')
graph_name = request.args.get('graph_name')
reply = _wrap_reply(BACKEND_SERVER.retrieve_tensor_hits, tensor_name, graph_name)
return reply


BACKEND_SERVER = _initialize_debugger_server()
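
Both new tensor routes take tensor_name and graph_name as query parameters; a hedged client sketch (the address and names are hypothetical, not from this commit):

import requests

BASE = "http://127.0.0.1:8080/v1/mindinsight"  # placeholder deployment address
params = {"tensor_name": "Default/conv1.weight:0", "graph_name": "graph_0"}  # hypothetical names

graph = requests.get(f"{BASE}/debugger/tensor_graphs", params=params).json()
hits = requests.get(f"{BASE}/debugger/tensor_hits", params=params).json()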




+15 -0 mindinsight/conditionmgr/__init__.py

@@ -0,0 +1,15 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Provide condition manager function."""

+15 -0 mindinsight/conditionmgr/common/__init__.py

@@ -0,0 +1,15 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Common module."""

+18 -0 mindinsight/conditionmgr/common/utils.py

@@ -0,0 +1,18 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Utils module."""
from collections import namedtuple

NodeBasicInfo = namedtuple('node_basic_info', ['name', 'full_name', 'type'])
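
NodeBasicInfo is the lightweight record passed between the graph handlers and the recommender below; a quick illustration with made-up values:

from mindinsight.conditionmgr.common.utils import NodeBasicInfo

# Made-up node; in practice the graph stream fills these fields in.
info = NodeBasicInfo(name='graph_0/conv1.weight', full_name='Default/conv1.weight', type='Parameter')
print(info.name, info.type)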

+232 -0 mindinsight/conditionmgr/condition.py

@@ -0,0 +1,232 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""
Management of all conditions.

This module is used to register all conditions, as well as their parameters.
This module also provides the available conditions to the condition_collections api.
"""
from enum import Enum
from mindinsight.conditionmgr.log import logger


class ConditionIdEnum(Enum):
"""Condition ids."""
WEIGHT_INITIALIZATION = "weight_initialization"
WEIGHT_OVERFLOW = "weight_overflow"
WEIGHT_TOO_LARGE = "weight_too_large"
WEIGHT_TOO_SMALL = "weight_too_small"
GRADIENT_VANISHING = "gradient_vanishing"
GRADIENT_TOO_LARGE = "gradient_too_large"
GRADIENT_EXPLODING = "gradient_exploding"
TENSOR_OVERFLOW = "tensor_overflow"
OPERATOR_OVERFLOW = "operator_overflow"
NAN = "nan"
OVERFLOW_ASCEND_CHIP = "overflow"
INF = "inf"
MAX_GT = "max_gt"
MAX_LT = "max_lt"
MIN_GT = "min_gt"
MIN_LT = "min_lt"
MAX_MIN_GT = "max_min_gt"
MAX_MIN_LT = "max_min_lt"
MEAN_GT = "mean_gt"
MEAN_LT = "mean_lt"
TENSOR_INITIALIZATION = "tensor_initialization"
TENSOR_TOO_LARGE = "tensor_too_large"
TENSOR_TOO_SMALL = "tensor_too_small"
TENSOR_ALL_ZERO = "tensor_all_zero"
WEIGHT_NOT_CHANGED = "weight_not_changed"
WEIGHT_CHANGE_TOO_LARGE = "weight_change_too_large"
WEIGHT_CHANGE_TOO_SMALL = "weight_change_too_small"
TENSOR_CHANGE_TOO_LARGE = "tensor_change_too_large"
TENSOR_CHANGE_TOO_SMALL = "tensor_change_too_small"
TENSOR_NOT_CHANGED = "tensor_not_changed"


class OptimizePhaseEnum(Enum):
"""Optimize phases."""
TENSOR_CHECK = 400
OPERATOR_CHECK = 100
LOSS_CHECK = 300
INPUT_DATA_CHECK = 200


class ValueTypeEnum(Enum):
"""Value types."""
FLOAT64 = 1
INT64 = 2
BOOL = 3


class PlatformEnum(Enum):
"""Platform types."""
GPU = "GPU"
ASCEND = "Ascend"


class TargetTypeEnum(Enum):
"""Target types."""
TENSOR = 'tensor'
WEIGHT = 'weight'
ACTIVATION = 'activation'
GRADIENT = 'gradient'


class ConditionContext:
"""
The class for condition context.

Args:
backend (str): the backend platform, for example "Ascend" or "GPU".
step (int): the current training step. Default: 0.
debugger_capability (tuple): the debugger capability version. Default: (1, 0).
"""
def __init__(self, backend, step=0, debugger_capability=(1, 0)):
self._backend = backend
self._step = step
self._debugger_capability = debugger_capability

@property
def backend(self):
"""Get backend."""
return self._backend

@property
def step(self):
"""Get _step."""
return self._step

@property
def debugger_capability(self):
"""Get debugger_capability."""
return self._debugger_capability


class ConditionParameter:
"""
The class for parameters of conditions.

Args:
name (str): parameter name.
value_type (ValueTypeEnum): the type of value.
support_disable (bool): whether the parameter can be disabled (left unassigned).
default_value (float): default value.
visible_on_ui (bool): whether the parameter is visible on the UI.
"""
def __init__(self, name, value_type: ValueTypeEnum, support_disable=True, default_value=None, visible_on_ui=True):
self._name = name
self._type = value_type
self._support_disable = support_disable
self._default_value = default_value
self._visible_on_ui = visible_on_ui

@property
def name(self):
"""Get name of parameter."""
return self._name

@property
def type(self):
"""Get type of parameter."""
return self._type

@property
def support_disable(self):
"""Get support_disable of parameter."""
return self._support_disable

@property
def default_value(self):
"""Get default_value of parameter."""
return self._default_value

@property
def visible_on_ui(self):
"""Get visible_on_ui of parameter."""
return self._visible_on_ui


class Condition:
"""
The class for conditions.

Args:
condition_id (str): condition id.
abbr (str): the abbreviation of condition id.
optimize_phase (OptimizePhaseEnum): optimize phase.
parameters (List[ConditionParameter]): parameters.
supported_target_type (TargetTypeEnum): the supported target type.
supported_platforms (tuple[PlatformEnum, PlatformEnum]): the supported platforms.
minimum_debugger_capability (tuple): the minimum debugger capability required.
available_test_func (func): the function used to test whether the condition is available
"""
def __init__(self, condition_id, abbr, optimize_phase, parameters, supported_target_type, supported_platforms,
minimum_debugger_capability, available_test_func=None):
self.id = condition_id
self._abbr = abbr
self.optimize_phase = optimize_phase
self._parameters = {
parameter.name: parameter for parameter in parameters
}
self._supported_target_type = supported_target_type
self.supported_platforms = supported_platforms
self.minimum_debugger_capability = minimum_debugger_capability
self.available_test_func = available_test_func

def get_parameter_definition(self, name):
"""Return parameter definition by the name"""
return self._parameters[name]

def is_available(self, condition_context):
"""Check is the condition available."""
backend = condition_context.backend
debugger_capability = condition_context.debugger_capability
if debugger_capability < self.minimum_debugger_capability:
logger.debug("The debugger capability is lower than the minimum debugger capability.")
return False
if backend not in [platform.value for platform in self.supported_platforms]:
logger.debug("The condition %s is not supported on the platform.", self.id)
return False
if self.available_test_func is None:
return True
return self.available_test_func(condition_context)

@property
def abbr(self):
"""The abbreviation of condition"""
return self._abbr

@property
def names(self):
"""The name of condition"""
return self._parameters.keys()

@property
def parameters(self):
"""The parameters of condition"""
return self._parameters.values()

@property
def supported_target_type(self):
"""The supported target type of condition"""
return self._supported_target_type


def check_initialization_available(condition_context):
"""Check if initialization is available at this step"""
if condition_context.step == 0:
return True
return False
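
To make the data model above concrete, a small sketch; the condition id and parameter here are made up, the real definitions live in condition_list.py below:

from mindinsight.conditionmgr.condition import (Condition, ConditionContext, ConditionParameter,
                                                OptimizePhaseEnum, PlatformEnum, TargetTypeEnum,
                                                ValueTypeEnum)

demo = Condition(
    condition_id="demo_max_gt",  # made-up id for illustration only
    abbr="DMG",
    optimize_phase=OptimizePhaseEnum.TENSOR_CHECK,
    parameters=[ConditionParameter(name="param", value_type=ValueTypeEnum.FLOAT64)],
    supported_target_type=TargetTypeEnum.TENSOR,
    supported_platforms=(PlatformEnum.ASCEND, PlatformEnum.GPU),
    minimum_debugger_capability=(1, 0))

context = ConditionContext(backend="GPU", step=0, debugger_capability=(1, 1))
print(demo.is_available(context))  # True: platform supported and capability sufficient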

+599 -0 mindinsight/conditionmgr/condition_list.py

@@ -0,0 +1,599 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""
Condition list.

This module provides the detailed condition list.
"""
from mindinsight.conditionmgr.condition import Condition
from mindinsight.conditionmgr.condition import OptimizePhaseEnum
from mindinsight.conditionmgr.condition import ConditionParameter
from mindinsight.conditionmgr.condition import ValueTypeEnum
from mindinsight.conditionmgr.condition import TargetTypeEnum
from mindinsight.conditionmgr.condition import PlatformEnum
from mindinsight.conditionmgr.condition import check_initialization_available

CONDITION_LIST = [
Condition(
condition_id="weight_initialization",
abbr="WI",
# Sending this condition to MindSpore will use WatchCondition.Condition.tensor_initialization
optimize_phase=OptimizePhaseEnum.TENSOR_CHECK,
parameters=[
ConditionParameter(
name="zero_percentage_ge",
value_type=ValueTypeEnum.FLOAT64,
default_value=100
),
ConditionParameter(
name="max_gt",
value_type=ValueTypeEnum.FLOAT64
),
ConditionParameter(
name="min_lt",
value_type=ValueTypeEnum.FLOAT64
)
],
supported_target_type=TargetTypeEnum.WEIGHT,
supported_platforms=(PlatformEnum.ASCEND, PlatformEnum.GPU),
minimum_debugger_capability=(1, 1),
available_test_func=check_initialization_available
),
Condition(
condition_id="weight_overflow",
abbr="WO",
# Sending this condition to MindSpore will use WatchCondition.Condition.tensor_general_overflow
optimize_phase=OptimizePhaseEnum.TENSOR_CHECK,
parameters=[],
supported_target_type=TargetTypeEnum.TENSOR,
supported_platforms=(PlatformEnum.ASCEND, PlatformEnum.GPU),
minimum_debugger_capability=(1, 1)
),
Condition(
condition_id="weight_too_large",
abbr="WL",
# Sending this condition to MindSpore will use WatchCondition.Condition.tensor_too_large
optimize_phase=OptimizePhaseEnum.TENSOR_CHECK,
parameters=[
ConditionParameter(
name="abs_mean_gt",
value_type=ValueTypeEnum.FLOAT64
),
ConditionParameter(
name="max_gt",
value_type=ValueTypeEnum.FLOAT64
),
ConditionParameter(
name="min_gt",
value_type=ValueTypeEnum.FLOAT64
),
ConditionParameter(
name="mean_gt",
value_type=ValueTypeEnum.FLOAT64
)
],
supported_target_type=TargetTypeEnum.WEIGHT,
supported_platforms=(PlatformEnum.ASCEND, PlatformEnum.GPU),
minimum_debugger_capability=(1, 1)
),
Condition(
condition_id="weight_too_small",
abbr="WS",
# Sending this condition to MindSpore will use WatchCondition.Condition.tensor_too_small
optimize_phase=OptimizePhaseEnum.TENSOR_CHECK,
parameters=[
ConditionParameter(
name="abs_mean_lt",
value_type=ValueTypeEnum.FLOAT64
),
ConditionParameter(
name="max_lt",
value_type=ValueTypeEnum.FLOAT64
),
ConditionParameter(
name="min_lt",
value_type=ValueTypeEnum.FLOAT64
),
ConditionParameter(
name="mean_lt",
value_type=ValueTypeEnum.FLOAT64
)
],
supported_target_type=TargetTypeEnum.WEIGHT,
supported_platforms=(PlatformEnum.ASCEND, PlatformEnum.GPU),
minimum_debugger_capability=(1, 1)
),
Condition(
condition_id="gradient_vanishing",
abbr="GV",
# Sending this condition to MindSpore will use WatchCondition.Condition.tensor_too_small
optimize_phase=OptimizePhaseEnum.TENSOR_CHECK,
parameters=[
ConditionParameter(
name="abs_mean_lt",
value_type=ValueTypeEnum.FLOAT64
),
ConditionParameter(
name="max_lt",
value_type=ValueTypeEnum.FLOAT64
),
ConditionParameter(
name="min_lt",
value_type=ValueTypeEnum.FLOAT64
),
ConditionParameter(
name="mean_lt",
value_type=ValueTypeEnum.FLOAT64
)
],
supported_target_type=TargetTypeEnum.GRADIENT,
supported_platforms=(PlatformEnum.ASCEND, PlatformEnum.GPU),
minimum_debugger_capability=(1, 1)
),
Condition(
condition_id="gradient_too_large",
abbr="GL",
# Sending this condition to MindSpore will use WatchCondition.Condition.tensor_too_large
optimize_phase=OptimizePhaseEnum.TENSOR_CHECK,
parameters=[
ConditionParameter(
name="abs_mean_gt",
value_type=ValueTypeEnum.FLOAT64
),
ConditionParameter(
name="max_gt",
value_type=ValueTypeEnum.FLOAT64
),
ConditionParameter(
name="min_gt",
value_type=ValueTypeEnum.FLOAT64
),
ConditionParameter(
name="mean_gt",
value_type=ValueTypeEnum.FLOAT64
)
],
supported_target_type=TargetTypeEnum.GRADIENT,
supported_platforms=(PlatformEnum.ASCEND, PlatformEnum.GPU),
minimum_debugger_capability=(1, 1)
),
Condition(
condition_id="gradient_exploding",
abbr="GE",
# Sending this condition to MindSpore will use WatchCondition.Condition.tensor_general_overflow
optimize_phase=OptimizePhaseEnum.TENSOR_CHECK,
parameters=[],
supported_target_type=TargetTypeEnum.GRADIENT,
supported_platforms=(PlatformEnum.ASCEND, PlatformEnum.GPU),
minimum_debugger_capability=(1, 1)
),
Condition(
condition_id="tensor_overflow",
abbr="TO",
# Sending this condition to MindSpore will use WatchCondition.Condition.tensor_general_overflow
optimize_phase=OptimizePhaseEnum.TENSOR_CHECK,
parameters=[],
supported_target_type=TargetTypeEnum.TENSOR,
supported_platforms=(PlatformEnum.ASCEND, PlatformEnum.GPU),
minimum_debugger_capability=(1, 1)
),
Condition(
condition_id="operator_overflow",
abbr="OO",
# Sending this condition to MindSpore will use WatchCondition.Condition.overflow
optimize_phase=OptimizePhaseEnum.TENSOR_CHECK,
parameters=[],
supported_target_type=TargetTypeEnum.TENSOR,
supported_platforms=(PlatformEnum.ASCEND,),
minimum_debugger_capability=(1, 1)
),
Condition(
condition_id="nan",
abbr="NAN",
# Sending this condition to MindSpore will use WatchCondition.Condition.nan
optimize_phase=OptimizePhaseEnum.TENSOR_CHECK,
parameters=[],
supported_target_type=TargetTypeEnum.TENSOR,
supported_platforms=(PlatformEnum.GPU,),
minimum_debugger_capability=(1, 0)
),
Condition(
condition_id="overflow",
abbr="OVERFLOW",
# Sending this condition to MindSpore will use WatchCondition.Condition.overflow
optimize_phase=OptimizePhaseEnum.TENSOR_CHECK,
parameters=[],
supported_target_type=TargetTypeEnum.TENSOR,
supported_platforms=(PlatformEnum.ASCEND,),
minimum_debugger_capability=(1, 0)
),
Condition(
condition_id="inf",
abbr="INF",
# Sending this condition to MindSpore will use WatchCondition.Condition.inf
optimize_phase=OptimizePhaseEnum.TENSOR_CHECK,
parameters=[],
supported_target_type=TargetTypeEnum.TENSOR,
supported_platforms=(PlatformEnum.ASCEND, PlatformEnum.GPU),
minimum_debugger_capability=(1, 0)
),
Condition(
condition_id="max_gt",
abbr="MAX>",
# Sending this condition to MindSpore will use WatchCondition.Condition.max_gt
optimize_phase=OptimizePhaseEnum.TENSOR_CHECK,
parameters=[
ConditionParameter(
name="param",
value_type=ValueTypeEnum.FLOAT64
)
],
supported_target_type=TargetTypeEnum.TENSOR,
supported_platforms=(PlatformEnum.ASCEND, PlatformEnum.GPU),
minimum_debugger_capability=(1, 0)
),
Condition(
condition_id="max_lt",
abbr="MAX<",
# Sending this condition to MindSpore will use WatchCondition.Condition.max_lt
optimize_phase=OptimizePhaseEnum.TENSOR_CHECK,
parameters=[
ConditionParameter(
name="param",
value_type=ValueTypeEnum.FLOAT64
)
],
supported_target_type=TargetTypeEnum.TENSOR,
supported_platforms=(PlatformEnum.ASCEND, PlatformEnum.GPU),
minimum_debugger_capability=(1, 0)
),
Condition(
condition_id="min_gt",
abbr="MIN>",
# Sending this condition to MindSpore will use WatchCondition.Condition.min_gt
optimize_phase=OptimizePhaseEnum.TENSOR_CHECK,
parameters=[
ConditionParameter(
name="param",
value_type=ValueTypeEnum.FLOAT64
)
],
supported_target_type=TargetTypeEnum.TENSOR,
supported_platforms=(PlatformEnum.ASCEND, PlatformEnum.GPU),
minimum_debugger_capability=(1, 0)
),
Condition(
condition_id="min_lt",
abbr="MIN<",
# Sending this condition to MindSpore will use WatchCondition.Condition.min_lt
optimize_phase=OptimizePhaseEnum.TENSOR_CHECK,
parameters=[
ConditionParameter(
name="param",
value_type=ValueTypeEnum.FLOAT64
)
],
supported_target_type=TargetTypeEnum.TENSOR,
supported_platforms=(PlatformEnum.ASCEND, PlatformEnum.GPU),
minimum_debugger_capability=(1, 0)
),
Condition(
condition_id="max_min_gt",
abbr="MAX-MIN>",
# Sending this condition to MindSpore will use WatchCondition.Condition.max_min_gt
optimize_phase=OptimizePhaseEnum.TENSOR_CHECK,
parameters=[
ConditionParameter(
name="param",
value_type=ValueTypeEnum.FLOAT64
)
],
supported_target_type=TargetTypeEnum.TENSOR,
supported_platforms=(PlatformEnum.ASCEND, PlatformEnum.GPU),
minimum_debugger_capability=(1, 0)
),
Condition(
condition_id="max_min_lt",
abbr="MAX-Min<",
# Send this condition to MindSpore will use WatchCondition.Condition.max_min_lt
optimize_phase=OptimizePhaseEnum.TENSOR_CHECK,
parameters=[
ConditionParameter(
name="param",
value_type=ValueTypeEnum.FLOAT64
)
],
supported_target_type=TargetTypeEnum.TENSOR,
supported_platforms=(PlatformEnum.ASCEND, PlatformEnum.GPU),
minimum_debugger_capability=(1, 0)
),
Condition(
condition_id="mean_gt",
abbr="MEAN>",
# Sending this condition to MindSpore will use WatchCondition.Condition.mean_gt
optimize_phase=OptimizePhaseEnum.TENSOR_CHECK,
parameters=[
ConditionParameter(
name="param",
value_type=ValueTypeEnum.FLOAT64
)
],
supported_target_type=TargetTypeEnum.TENSOR,
supported_platforms=(PlatformEnum.ASCEND, PlatformEnum.GPU),
minimum_debugger_capability=(1, 0)
),
Condition(
condition_id="mean_lt",
abbr="MEAN<",
# Sending this condition to MindSpore will use WatchCondition.Condition.mean_lt
optimize_phase=OptimizePhaseEnum.TENSOR_CHECK,
parameters=[
ConditionParameter(
name="param",
value_type=ValueTypeEnum.FLOAT64
)
],
supported_target_type=TargetTypeEnum.TENSOR,
supported_platforms=(PlatformEnum.ASCEND, PlatformEnum.GPU),
minimum_debugger_capability=(1, 0)
),
Condition(
condition_id="tensor_initialization",
abbr="TI",
# Sending this condition to MindSpore will use WatchCondition.Condition.tensor_initialization
optimize_phase=OptimizePhaseEnum.TENSOR_CHECK,
parameters=[
ConditionParameter(
name="zero_percentage_ge",
value_type=ValueTypeEnum.FLOAT64,
default_value=100
),
ConditionParameter(
name="max_gt",
value_type=ValueTypeEnum.FLOAT64
),
ConditionParameter(
name="min_lt",
value_type=ValueTypeEnum.FLOAT64
)
],
supported_target_type=TargetTypeEnum.TENSOR,
supported_platforms=(PlatformEnum.ASCEND, PlatformEnum.GPU),
minimum_debugger_capability=(1, 1),
available_test_func=check_initialization_available
),
Condition(
condition_id="tensor_too_large",
abbr="TL",
# Sending this condition to MindSpore will use WatchCondition.Condition.tensor_too_large
optimize_phase=OptimizePhaseEnum.TENSOR_CHECK,
parameters=[
ConditionParameter(
name="abs_mean_gt",
value_type=ValueTypeEnum.FLOAT64
),
ConditionParameter(
name="max_gt",
value_type=ValueTypeEnum.FLOAT64
),
ConditionParameter(
name="min_gt",
value_type=ValueTypeEnum.FLOAT64
),
ConditionParameter(
name="mean_gt",
value_type=ValueTypeEnum.FLOAT64
)
],
supported_target_type=TargetTypeEnum.TENSOR,
supported_platforms=(PlatformEnum.ASCEND, PlatformEnum.GPU),
minimum_debugger_capability=(1, 1)
),
Condition(
condition_id="tensor_too_small",
abbr="TS",
# Sending this condition to MindSpore will use WatchCondition.Condition.tensor_too_small
optimize_phase=OptimizePhaseEnum.TENSOR_CHECK,
parameters=[
ConditionParameter(
name="abs_mean_lt",
value_type=ValueTypeEnum.FLOAT64
),
ConditionParameter(
name="max_lt",
value_type=ValueTypeEnum.FLOAT64
),
ConditionParameter(
name="min_lt",
value_type=ValueTypeEnum.FLOAT64
),
ConditionParameter(
name="mean_lt",
value_type=ValueTypeEnum.FLOAT64
)
],
supported_target_type=TargetTypeEnum.TENSOR,
supported_platforms=(PlatformEnum.ASCEND, PlatformEnum.GPU),
minimum_debugger_capability=(1, 1)
),
Condition(
condition_id="tensor_all_zero",
abbr="TZ",
# Sending this condition to MindSpore will use WatchCondition.Condition.tensor_all_zero
optimize_phase=OptimizePhaseEnum.TENSOR_CHECK,
parameters=[
ConditionParameter(
name="zero_percentage_ge",
value_type=ValueTypeEnum.FLOAT64,
default_value=100
)
],
supported_target_type=TargetTypeEnum.TENSOR,
supported_platforms=(PlatformEnum.ASCEND, PlatformEnum.GPU),
minimum_debugger_capability=(1, 1)
),
Condition(
condition_id="weight_not_changed",
abbr="WNC",
# Sending this condition to MindSpore will use WatchCondition.Condition.tensor_not_changed
optimize_phase=OptimizePhaseEnum.TENSOR_CHECK,
parameters=[
ConditionParameter(
name="rtol",
value_type=ValueTypeEnum.FLOAT64,
default_value=1e-5
),
ConditionParameter(
name="atol",
value_type=ValueTypeEnum.FLOAT64,
support_disable=False,
default_value=1e-8,
visible_on_ui=False
),
ConditionParameter(
name="equal_nan",
value_type=ValueTypeEnum.BOOL,
support_disable=False,
default_value=False,
visible_on_ui=False
)
],
supported_target_type=TargetTypeEnum.WEIGHT,
supported_platforms=(PlatformEnum.ASCEND, PlatformEnum.GPU),
minimum_debugger_capability=(1, 1)
),
Condition(
condition_id="weight_change_too_large",
abbr="WCL",
# Sending this condition to MindSpore will use WatchCondition.Condition.tensor_change_too_large
optimize_phase=OptimizePhaseEnum.TENSOR_CHECK,
parameters=[
ConditionParameter(
name="abs_update_ratio_mean_gt",
value_type=ValueTypeEnum.FLOAT64,
default_value=1e-1
),
ConditionParameter(
name="epsilon",
value_type=ValueTypeEnum.FLOAT64,
support_disable=False,
default_value=1e-9,
visible_on_ui=False
)
],
supported_target_type=TargetTypeEnum.WEIGHT,
supported_platforms=(PlatformEnum.ASCEND, PlatformEnum.GPU),
minimum_debugger_capability=(1, 1)
),
Condition(
condition_id="weight_change_too_small",
abbr="WCS",
# Sending this condition to MindSpore will use WatchCondition.Condition.tensor_change_too_small
optimize_phase=OptimizePhaseEnum.TENSOR_CHECK,
parameters=[
ConditionParameter(
name="abs_update_ratio_mean_lt",
value_type=ValueTypeEnum.FLOAT64,
default_value=1e-4
),
ConditionParameter(
name="epsilon",
value_type=ValueTypeEnum.FLOAT64,
support_disable=False,
default_value=1e-9,
visible_on_ui=False
)
],
supported_target_type=TargetTypeEnum.WEIGHT,
supported_platforms=(PlatformEnum.ASCEND, PlatformEnum.GPU),
minimum_debugger_capability=(1, 1)
),
Condition(
condition_id="tensor_change_too_large",
abbr="TCL",
# Sending this condition to MindSpore will use WatchCondition.Condition.tensor_change_too_large
optimize_phase=OptimizePhaseEnum.TENSOR_CHECK,
parameters=[
ConditionParameter(
name="abs_update_ratio_mean_gt",
value_type=ValueTypeEnum.FLOAT64,
default_value=1e-1
),
ConditionParameter(
name="epsilon",
value_type=ValueTypeEnum.FLOAT64,
support_disable=False,
default_value=1e-9,
visible_on_ui=False
)
],
supported_target_type=TargetTypeEnum.TENSOR,
supported_platforms=(PlatformEnum.ASCEND, PlatformEnum.GPU),
minimum_debugger_capability=(1, 1)
),
Condition(
condition_id="tensor_change_too_small",
abbr="TCS",
# Sending this condition to MindSpore will use WatchCondition.Condition.tensor_change_too_small
optimize_phase=OptimizePhaseEnum.TENSOR_CHECK,
parameters=[
ConditionParameter(
name="abs_update_ratio_mean_lt",
value_type=ValueTypeEnum.FLOAT64,
default_value=1e-4
),
ConditionParameter(
name="epsilon",
value_type=ValueTypeEnum.FLOAT64,
support_disable=False,
default_value=1e-9,
visible_on_ui=False
)
],
supported_target_type=TargetTypeEnum.TENSOR,
supported_platforms=(PlatformEnum.ASCEND, PlatformEnum.GPU),
minimum_debugger_capability=(1, 1)
),
Condition(
condition_id="tensor_not_changed",
abbr="TNC",
# Sending this condition to MindSpore will use WatchCondition.Condition.tensor_not_changed
optimize_phase=OptimizePhaseEnum.TENSOR_CHECK,
parameters=[
ConditionParameter(
name="rtol",
value_type=ValueTypeEnum.FLOAT64,
default_value=1e-5
),
ConditionParameter(
name="atol",
value_type=ValueTypeEnum.FLOAT64,
support_disable=False,
default_value=1e-8,
visible_on_ui=False
),
ConditionParameter(
name="equal_nan",
value_type=ValueTypeEnum.BOOL,
support_disable=False,
default_value=False,
visible_on_ui=False
)
],
supported_target_type=TargetTypeEnum.TENSOR,
supported_platforms=(PlatformEnum.ASCEND, PlatformEnum.GPU),
minimum_debugger_capability=(1, 1)
)
]
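
A short sketch of consuming this list directly, e.g. listing the condition ids available on a given platform (this mirrors the filtering that Condition.is_available performs):

from mindinsight.conditionmgr.condition import ConditionContext
from mindinsight.conditionmgr.condition_list import CONDITION_LIST

context = ConditionContext(backend="Ascend", step=0, debugger_capability=(1, 1))
available_ids = [condition.id for condition in CONDITION_LIST if condition.is_available(context)]
print(available_ids)  # includes "operator_overflow", which is Ascend-only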

+132 -0 mindinsight/conditionmgr/conditionmgr.py

@@ -0,0 +1,132 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""
Condition manager.

This module provides the condition manager function.
"""
from mindinsight.conditionmgr.condition import Condition
from mindinsight.conditionmgr.condition import TargetTypeEnum
from mindinsight.conditionmgr.condition_list import CONDITION_LIST
from mindinsight.conditionmgr.log import logger


class ConditionMgr:
"""Condition manager."""

def __init__(self):
self.conditions = {}
self.no_parameter_conditions = []
self._register_default_conditions()

def _register_default_conditions(self):
"""Register default condition definitions"""
self.register_conditions(CONDITION_LIST)

def register_condition(self, condition):
"""Register conditions into dict"""
if not condition.parameters:
self.no_parameter_conditions.append(condition.id)
self.conditions[condition.id] = condition

def register_conditions(self, conditions):
"""Register conditions"""
for condition in conditions:
self.register_condition(condition)

def get_all(self, condition_context):
"""Get all register conditions."""
conditions = []
for condition in self.conditions.values():
parameters = []
if not condition.is_available(condition_context):
continue
for param in condition.parameters:
if not param.visible_on_ui:
continue
parameters.append({
"name": param.name,
"type": param.type.name,
"support_disable": param.support_disable,
"default_value": param.default_value
})
conditions.append({
"id": condition.id,
"parameters": parameters,
"supported_target_type": condition.supported_target_type.name
})
conditions = sorted(conditions, key=lambda x: x.get('id'))
return {"conditions": conditions}

def get_condition(self, condition_id) -> Condition:
"""Get condition by condition id"""
return self.conditions[condition_id]

def has_condition(self, condition_id, condition_context) -> bool:
"""Return if the condition exist and avilible"""
if condition_id in self.conditions:
condition = self.get_condition(condition_id)
return condition.is_available(condition_context)
logger.warning("Condition id %s not found.", condition_id)
return False

def get_no_param_condition(self) -> list:
"""Return the list of condition without parameters"""
return self.no_parameter_conditions

@staticmethod
def check_and_sort(collections, target_type, reply):
"""Check the collection and sort conditions"""
collection = collections.get(target_type)
if collection:
collection = sorted(collection, key=lambda x: x.get('id'))
reply.append({"id": target_type + "_condition_collection", "conditions": collection})
else:
logger.warning("Condition collection for %s is None.", target_type)

def get_all_collections(self, condition_context):
"""Get all register conditions."""

collections = {
TargetTypeEnum.WEIGHT.value: [], TargetTypeEnum.TENSOR.value: [], TargetTypeEnum.GRADIENT.value: [],
TargetTypeEnum.ACTIVATION.value: []
}
for condition in self.conditions.values():
parameters = []
if not condition.is_available(condition_context):
continue
for param in condition.parameters:
if not param.visible_on_ui:
continue
parameters.append({
"name": param.name,
"type": param.type.name,
"support_disable": param.support_disable,
"default_value": param.default_value
})
collections[condition.supported_target_type.value].append({
"id": condition.id,
"parameters": parameters,
"supported_target_type": condition.supported_target_type.name,
"abbr": condition.abbr
})

reply = []
self.check_and_sort(collections, TargetTypeEnum.ACTIVATION.value, reply)
self.check_and_sort(collections, TargetTypeEnum.GRADIENT.value, reply)
self.check_and_sort(collections, TargetTypeEnum.TENSOR.value, reply)
self.check_and_sort(collections, TargetTypeEnum.WEIGHT.value, reply)

return reply
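
A minimal usage sketch of the manager; the backend and capability values are illustrative:

from mindinsight.conditionmgr.condition import ConditionContext
from mindinsight.conditionmgr.conditionmgr import ConditionMgr

mgr = ConditionMgr()  # registers CONDITION_LIST by default
context = ConditionContext(backend="GPU", step=0, debugger_capability=(1, 1))

print(mgr.has_condition("tensor_all_zero", context))  # True: available on GPU at capability (1, 1)
collections = mgr.get_all_collections(context)
print([collection["id"] for collection in collections])  # one collection per target type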

+19 -0 mindinsight/conditionmgr/log.py

@@ -0,0 +1,19 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Log module"""

from mindinsight.utils.log import setup_logger

logger = setup_logger(sub_module="conditionmgr", log_name="conditionmgr")

+365 -0 mindinsight/conditionmgr/recommender.py

@@ -0,0 +1,365 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""
Predefined watchpoints.

This module predefines the recommended watchpoints.
"""
import queue as Queue

from mindinsight.conditionmgr.conditionmgr import ConditionMgr
from mindinsight.conditionmgr.condition import TargetTypeEnum
from mindinsight.conditionmgr.condition import ConditionIdEnum
from mindinsight.conditionmgr.common.utils import NodeBasicInfo
from mindinsight.conditionmgr.log import logger
from mindinsight.conf import settings


UNSELECTED_STATUS = 0
HALF_SELECTED_STATUS = 1
SELECTED_STATUS = 2


class _WatchPointData:
"""WatchPoint data container"""
def __init__(self, watch_condition, watch_nodes):
self.watch_condition = watch_condition
self.watch_nodes = watch_nodes

def get_watch_condition_dict(self):
return {
"id": self.watch_condition.get("condition"),
"params": [{
"name": param.get_parameter_name(),
"disable": False,
"value": param.value
} for param in self.watch_condition.get("params")]
}


class _ConditionParameterValue:
"""Condition parameter data container"""
def __init__(self, parameter, value):
self.parameter = parameter
self.value = value

def get_parameter_name(self):
return self.parameter.name


def recommend_watchpoints(condition_mgr: ConditionMgr, graph_stream, condition_context):
"""
Recommend watchpoints.

Args:
condition_mgr (ConditionMgr): Condition manager instance.
graph_stream (GraphHandler): Graph handler instance.
condition_context (ConditionContext): Context for condition.

Returns:
list[WatchPointData], watch points to be created.
"""
watch_points = []

if not graph_stream.graph:
logger.warning("Given graph is None.")
return watch_points

if not settings.ENABLE_RECOMMENDED_WATCHPOINTS:
return watch_points

# add weight watch points
merged_info = _get_basic_node_info(TargetTypeEnum.WEIGHT.value, graph_stream)
_recommend_weight_initialization(merged_info, condition_mgr, watch_points, condition_context)
_recommend_weight_change_too_large(merged_info, condition_mgr, watch_points, condition_context)

# Because we cannot identify trainable weights currently, weight_no_change and weight_change_too_small will not be
# recommended.
trainable_weight_nodes = []
_recommend_weight_not_changed(condition_mgr, trainable_weight_nodes, watch_points, condition_context)
_recommend_weight_change_too_small(condition_mgr, trainable_weight_nodes, watch_points, condition_context)

# add gradient watch points
merged_info = _get_basic_node_info(TargetTypeEnum.GRADIENT.value, graph_stream)
_recommend_gradient_vanishing(merged_info, condition_mgr, watch_points, condition_context)

# add tensor watch points
merged_info = _get_basic_node_info(TargetTypeEnum.TENSOR.value, graph_stream)
_recommend_overflow_ascend_chip(merged_info, condition_mgr, watch_points, condition_context)
_recommend_tensor_overflow(merged_info, condition_mgr, watch_points, condition_context)
_recommend_tensor_all_zero(merged_info, condition_mgr, watch_points, condition_context)
return watch_points


def _recommend_tensor_all_zero(basic_info_nodes, condition_mgr, watch_points, condition_context):
"""Recommend tensor all zero watchpoint."""
if not basic_info_nodes:
return
if not condition_mgr.has_condition(ConditionIdEnum.TENSOR_ALL_ZERO.value, condition_context):
return
condition = condition_mgr.get_condition(condition_id=ConditionIdEnum.TENSOR_ALL_ZERO.value)
tensor_all_zero_watchpoint = _WatchPointData(
watch_condition={
"condition": condition.id,
"params": [_ConditionParameterValue(
parameter=condition.get_parameter_definition("zero_percentage_ge"),
value=100 # set default value to 100
)]
},
watch_nodes=basic_info_nodes.copy(),
)
watch_points.append(tensor_all_zero_watchpoint)


def _recommend_tensor_overflow(basic_info_nodes, condition_mgr, watch_points, condition_context):
"""Recommend tensor general overflow watchpoint."""
if not basic_info_nodes:
return
if not condition_mgr.has_condition(ConditionIdEnum.TENSOR_OVERFLOW.value, condition_context):
return

condition = condition_mgr.get_condition(condition_id=ConditionIdEnum.TENSOR_OVERFLOW.value)
overflow_watchpoint = _WatchPointData(
watch_condition={
"condition": condition.id,
"params": []
},
watch_nodes=basic_info_nodes.copy(),
)
watch_points.append(overflow_watchpoint)


def _recommend_overflow_ascend_chip(basic_info_nodes, condition_mgr, watch_points, condition_context):
"""Recommend tensor overflow watchpoint."""
if not basic_info_nodes:
return
if not condition_mgr.has_condition(ConditionIdEnum.OVERFLOW_ASCEND_CHIP.value, condition_context):
return

condition = condition_mgr.get_condition(condition_id=ConditionIdEnum.OVERFLOW_ASCEND_CHIP.value)
overflow_d_watchpoint = _WatchPointData(
watch_condition={
"condition": condition.id,
"params": []
},
watch_nodes=basic_info_nodes.copy(),
)
watch_points.append(overflow_d_watchpoint)


def _recommend_gradient_vanishing(basic_info_nodes, condition_mgr, watch_points, condition_context):
"""Recommend gradient vanishing watchpoint."""
if not basic_info_nodes:
return
if not condition_mgr.has_condition(ConditionIdEnum.GRADIENT_VANISHING.value, condition_context):
return

condition = condition_mgr.get_condition(condition_id=ConditionIdEnum.GRADIENT_VANISHING.value)
gradient_vanishing_watchpoint = _WatchPointData(
watch_condition={
"condition": condition.id,
"params": [_ConditionParameterValue(
parameter=condition.get_parameter_definition("abs_mean_lt"),
value=1e-9 # set default value to 1e-9
)]
},
watch_nodes=basic_info_nodes.copy(),
)
watch_points.append(gradient_vanishing_watchpoint)


def _recommend_weight_change_too_small(condition_mgr, trainable_weight_nodes, watch_points, condition_context):
"""Recommend weight change too small watchpoint."""
if not trainable_weight_nodes:
return
if not condition_mgr.has_condition(ConditionIdEnum.WEIGHT_CHANGE_TOO_SMALL.value, condition_context):
return

condition = condition_mgr.get_condition(condition_id=ConditionIdEnum.WEIGHT_CHANGE_TOO_SMALL.value)
weight_change_too_small_watchpoint = _WatchPointData(
watch_condition={
"condition": condition.id,
"params": [
_ConditionParameterValue(
parameter=condition.get_parameter_definition("abs_update_ratio_mean_lt"),
value=1.0e-4 # set default value to 1.0e-4
),
]
},
watch_nodes=trainable_weight_nodes,
)
watch_points.append(weight_change_too_small_watchpoint)


def _recommend_weight_not_changed(condition_mgr, trainable_weight_nodes, watch_points, condition_context):
"""Recommend weight not changed watchpoint."""
if not trainable_weight_nodes:
return
if not condition_mgr.has_condition(ConditionIdEnum.WEIGHT_NOT_CHANGED.value, condition_context):
return

condition = condition_mgr.get_condition(condition_id=ConditionIdEnum.WEIGHT_NOT_CHANGED.value)
weight_no_change_watchpoint = _WatchPointData(
watch_condition={
"condition": condition.id,
"params": [
_ConditionParameterValue(
parameter=condition.get_parameter_definition("rtol"),
value=1.0e-5 # set default value to 1.0e-5
),
_ConditionParameterValue(
parameter=condition.get_parameter_definition("atol"),
value=1.0e-8 # set default value to 1.0e-8
),
]
},
watch_nodes=trainable_weight_nodes,
)
watch_points.append(weight_no_change_watchpoint)


def _recommend_weight_change_too_large(basic_info_nodes, condition_mgr, watch_points, condition_context):
"""Recommend weight change too large watchpoint."""
if not basic_info_nodes:
return
if not condition_mgr.has_condition(ConditionIdEnum.WEIGHT_CHANGE_TOO_LARGE.value, condition_context):
return

condition = condition_mgr.get_condition(condition_id=ConditionIdEnum.WEIGHT_CHANGE_TOO_LARGE.value)
weight_change_too_large_watchpoint = _WatchPointData(
watch_condition={
"condition": condition.id,
"params": [_ConditionParameterValue(
parameter=condition.get_parameter_definition("abs_update_ratio_mean_gt"),
value=0.1 # set default value to 0.1
)]
},
watch_nodes=basic_info_nodes.copy(),
)
watch_points.append(weight_change_too_large_watchpoint)


def _recommend_weight_initialization(basic_info_nodes, condition_mgr, watch_points, condition_context):
"""Recommend weight initialization watchpoint."""
if not basic_info_nodes:
return
if not condition_mgr.has_condition(ConditionIdEnum.WEIGHT_INITIALIZATION.value, condition_context):
return

condition = condition_mgr.get_condition(condition_id=ConditionIdEnum.WEIGHT_INITIALIZATION.value)
weight_initialization_watchpoint = _WatchPointData(
watch_condition={
"condition": condition.id,
"params": [_ConditionParameterValue(
parameter=condition.get_parameter_definition("zero_percentage_ge"),
value=100 # set default value to 100
)]
},
watch_nodes=basic_info_nodes.copy(),
)
watch_points.append(weight_initialization_watchpoint)


def _get_basic_node_info(node_category, graph_stream):
"""Get node merged info."""
basic_info_nodes = _get_basic_node_info_by_node_category(node_category, graph_stream)
merged_info = _merge_nodes(basic_info_nodes, graph_stream.whole_graph)
merged_info = _add_graph_name(merged_info, graph_stream)
return merged_info


def _get_basic_node_info_by_node_category(node_category, graph_stream):
"""Get node basic info by node category."""
all_graph_nodes = graph_stream.get_searched_nodes(pattern={'node_category': node_category})
basic_info_nodes = []
for graph_name, nodes in all_graph_nodes.items():
if len(all_graph_nodes) == 1:
logger.debug("This is a single graph")
graph_name = ""
for node in nodes:
if graph_name == "":
basic_node_info = NodeBasicInfo(name=node.name, full_name=node.full_name, type=node.type)
else:
basic_node_info = graph_stream.construct_node_basic_info(
full_name=node.full_name, graph_name=graph_name, node_name=node.name, node_type=node.type)
basic_info_nodes.append(basic_node_info)
return basic_info_nodes


def _merge_nodes(leaf_nodes, graph):
"""merge nodes in one graph"""
unmerged_tree = graph.get_nodes(leaf_nodes)
tmp_node_queue = Queue.Queue()

# Collect watch nodes in layer (breadth-first) order.
watch_nodes = []
for node in unmerged_tree:
if node["type"] != "name_scope":
# A leaf node is fully selected.
node["status"] = SELECTED_STATUS
else:
# A scope node starts unselected; it is selected only when all of its children are.
node["status"] = UNSELECTED_STATUS
tmp_node_queue.put(node)
while not tmp_node_queue.empty():
cur_node = tmp_node_queue.get()
watch_nodes.append(cur_node)
for sub_node in cur_node["nodes"]:
if sub_node["type"] != "name_scope":
# A leaf node is fully selected.
sub_node["status"] = SELECTED_STATUS
else:
# A scope node starts unselected; it is selected only when all of its children are.
sub_node["status"] = UNSELECTED_STATUS
tmp_node_queue.put(sub_node)

merged_watch_nodes = []
while watch_nodes:
cur_node = watch_nodes.pop()
node_name = cur_node["name"]
sub_count = graph.normal_node_map.get(node_name).subnode_count
if len(cur_node["nodes"]) < sub_count or not cur_node["nodes"]:
continue
is_all_chosen = True
for sub_node in cur_node["nodes"]:
if sub_node["status"] != SELECTED_STATUS:
is_all_chosen = False
break

if is_all_chosen:
cur_node["status"] = SELECTED_STATUS
merged_watch_nodes.append(cur_node)
else:
cur_node["status"] = HALF_SELECTED_STATUS
logger.debug("merged_watch_nodes: %s", merged_watch_nodes)
out_nodes = []
for node_info in merged_watch_nodes:
node_basic_info = NodeBasicInfo(name=node_info["name"], full_name=node_info["name"], type=node_info["type"])
out_nodes.append(node_basic_info)
logger.debug("out_nodes: %s", out_nodes)
return out_nodes


def _add_graph_name(nodes, graph_stream):
"""add graph_name in node.name"""
if len(graph_stream.graph) > 1:
return nodes
graph_name = graph_stream.graph_names[0]
output_nodes = []
for node in nodes:
node_basic_info = graph_stream.construct_node_basic_info(
full_name=node.name, graph_name=graph_name, node_name=node.name, node_type=node.type)
output_nodes.append(node_basic_info)
return output_nodes
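The `_merge_nodes` helper above works bottom-up: every leaf is marked selected, and a scope collapses into a single watch node once all of its children are selected. Below is a minimal standalone sketch of that status propagation, assuming a simplified node dict with `name`, `type` and `nodes` keys; the constants, function name and sample tree are illustrative only, not the module's API.

# A minimal sketch of the bottom-up scope merging; it omits the subnode_count
# completeness check that the real _merge_nodes performs against the graph.
SELECTED_STATUS, HALF_SELECTED_STATUS, UNSELECTED_STATUS = 2, 1, 0

def merge_selected_scopes(tree):
    """Collapse a scope into one watch node when all of its children are selected."""
    merged = []

    def visit(node):
        if node["type"] != "name_scope":
            node["status"] = SELECTED_STATUS  # leaf nodes are fully selected
            return node["status"]
        statuses = [visit(sub) for sub in node["nodes"]]
        if statuses and all(status == SELECTED_STATUS for status in statuses):
            node["status"] = SELECTED_STATUS
            merged.append(node["name"])  # watch the whole scope as one node
        else:
            node["status"] = HALF_SELECTED_STATUS
        return node["status"]

    for root in tree:
        visit(root)
    return merged

# Both conv layers under "backbone" are selected, so the scope merges.
tree = [{"name": "backbone", "type": "name_scope", "nodes": [
    {"name": "backbone/conv1", "type": "Conv2D", "nodes": []},
    {"name": "backbone/conv2", "type": "Conv2D", "nodes": []}]}]
assert merge_selected_scopes(tree) == ["backbone"]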

+ 2
- 0
mindinsight/conf/constants.py View File

@@ -63,3 +63,5 @@ MAX_GRAPH_STEP_SIZE_PER_TAG = 1
MAX_HISTOGRAM_STEP_SIZE_PER_TAG = 50
MAX_TENSOR_STEP_SIZE_PER_TAG = 20
MAX_TENSOR_RESPONSE_DATA_SIZE = 100000

ENABLE_RECOMMENDED_WATCHPOINTS = False
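The flag ships disabled by default. A hedged sketch of how a caller might gate the recommendation step on it; the wrapper function here is hypothetical, while `settings` and `recommend_watchpoints` are the module paths used elsewhere in this commit.

from mindinsight.conf import settings
from mindinsight.conditionmgr.recommender import recommend_watchpoints

def maybe_recommend_watchpoints(condition_mgr, graph_stream, condition_context):
    """Hypothetical gate: only recommend watchpoints when the flag is enabled."""
    if not getattr(settings, 'ENABLE_RECOMMENDED_WATCHPOINTS', False):
        return []
    return recommend_watchpoints(condition_mgr, graph_stream, condition_context)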

+ 2
- 2
mindinsight/datavisual/data_transform/graph/msgraph.py View File

@@ -54,7 +54,7 @@ class MSGraph(Graph):
node_protos (list[anf_ir_pb2.NodeProto]): Refer to anf_ir_pb2.NodeProto.
"""
logger.debug("Start to parse op nodes from proto.")
for node_proto in node_protos:
for topological_index, node_proto in enumerate(node_protos):
if not node_proto.name:
logger.warning("Finding a node with an empty name will not save it.")
continue
@@ -69,7 +69,7 @@ class MSGraph(Graph):
# The Graphviz plug-in that the UI uses can't handle these special characters.
check_invalid_character(node_name)

node = Node(name=node_name, node_id=node_proto.name)
node = Node(name=node_name, node_id=node_proto.name, topological_index=topological_index)
node.full_name = node_proto.full_name
node.type = node_proto.op_type



+ 3
- 1
mindinsight/datavisual/data_transform/graph/node.py View File

@@ -35,7 +35,7 @@ class Node:
node_id (str): The id of this node, and node id is unique in graph.
"""

def __init__(self, name, node_id):
def __init__(self, name, node_id, topological_index=-1):
self._node_id = node_id
self.name = name
self.type = ""
@@ -53,6 +53,8 @@ class Node:
self.output_nums = 0
self.elem_types = []
self.full_name = ""
# This value will be used as the priority field.
self.topological_index = topological_index

def to_dict(self):
"""Converts the node object to dictionary format."""


+ 1
- 1
mindinsight/datavisual/data_transform/graph/node_tree.py View File

@@ -16,7 +16,7 @@
This file defines the graph node and associated base types.
"""
from mindinsight.debugger.common.exceptions.exceptions import DebuggerParamValueError
from mindinsight.debugger.common.log import logger as log
from mindinsight.debugger.common.log import LOGGER as log


class NodeTree:


+ 2
- 0
mindinsight/debugger/common/exceptions/error_code.py View File

@@ -37,6 +37,7 @@ class DebuggerErrors(DebuggerErrorCodes):
CONTINUE_ERROR = 3 | _DEBUGGER_RUNNING_ERROR
PAUSE_ERROR = 4 | _DEBUGGER_RUNNING_ERROR
COMPARE_TENSOR_ERROR = 5 | _DEBUGGER_RUNNING_ERROR
RECHECK_ERROR = 6 | _DEBUGGER_RUNNING_ERROR


@unique
@@ -52,3 +53,4 @@ class DebuggerErrorMsg(Enum):
DELETE_WATCHPOINT_ERROR = "Delete watchpoint failed. {}"
CONTINUE_ERROR = "Continue debugging failed. {}"
PAUSE_ERROR = "Pause debugging failed. {}"
RECHECK_ERROR = "Recheck failed. {}"

+ 11
- 0
mindinsight/debugger/common/exceptions/exceptions.py View File

@@ -72,6 +72,17 @@ class DebuggerDeleteWatchPointError(MindInsightException):
)


class DebuggerRecheckError(MindInsightException):
"""The error about deleting watch point."""

def __init__(self, msg):
super(DebuggerRecheckError, self).__init__(
error=DebuggerErrors.RECHECK_ERROR,
message=DebuggerErrorMsg.RECHECK_ERROR.value.format(msg),
http_code=400
)


class DebuggerCompareTensorError(MindInsightException):
"""The error about comparing tensors."""



+ 1
- 1
mindinsight/debugger/common/log.py View File

@@ -17,4 +17,4 @@ from mindinsight.utils.log import setup_logger

LOG_NAME = "debugger"
LOG_MODULE = "debugger"
logger = setup_logger(sub_module=LOG_MODULE, log_name=LOG_NAME)
LOGGER = setup_logger(sub_module=LOG_MODULE, log_name=LOG_NAME)

+ 7
- 4
mindinsight/debugger/common/utils.py View File

@@ -14,7 +14,6 @@
# ============================================================================
"""Define the utils."""
import enum
from collections import namedtuple

import numpy as np

@@ -72,7 +71,12 @@ class Streams(enum.Enum):
WATCHPOINT_HIT = 'watchpoint_hit'


NodeBasicInfo = namedtuple('node_basic_info', ['name', 'full_name', 'type'])
class RunLevel(enum.Enum):
"""Run Level enum, it depends on whether the program is executed node by node,
step by step, or in recheck phase"""
NODE = "node"
STEP = "step"
RECHECK = "recheck"


def get_ack_reply(state=0):
@@ -140,5 +144,4 @@ def create_view_event_from_tensor_history(tensor_history):

def is_scope_type(node_type):
"""Judge whether the type is scope type."""
scope_types = [NodeTypeEnum.NAME_SCOPE.value, NodeTypeEnum.AGGREGATION_SCOPE.value]
return node_type in scope_types
return node_type.endswith('scope')
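The suffix check means any future `*scope` node type is treated as a scope without touching `NodeTypeEnum`. A quick sketch of the behavioral difference; the enum values are assumed to be the lowercase strings shown, and `proxy_scope` is a made-up new kind used only for illustration.

def is_scope_type_old(node_type):
    # Old behavior: only the two enumerated scope types qualify (assumed string values).
    return node_type in ['name_scope', 'aggregation_scope']

def is_scope_type_new(node_type):
    # New behavior: any type name ending in 'scope' qualifies.
    return node_type.endswith('scope')

assert is_scope_type_old('name_scope') and is_scope_type_new('name_scope')
# A newly introduced scope kind passes the suffix check without code changes.
assert not is_scope_type_old('proxy_scope') and is_scope_type_new('proxy_scope')
assert not is_scope_type_new('Conv2D')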

+ 1
- 1
mindinsight/debugger/debugger_cache.py View File

@@ -15,7 +15,7 @@
"""Implement the debugger data cache manager."""
import sys

from mindinsight.debugger.common.log import logger as log
from mindinsight.debugger.common.log import LOGGER as log
from mindinsight.debugger.common.utils import Streams
from mindinsight.debugger.stream_handler import EventHandler, MetadataHandler, GraphHandler, \
TensorHandler, WatchpointHandler, WatchpointHitHandler


+ 92
- 23
mindinsight/debugger/debugger_grpc_server.py View File

@@ -15,11 +15,13 @@
"""Implement the debugger grpc server."""
from functools import wraps

from mindinsight.debugger.common.log import logger as log
import mindinsight.conditionmgr.recommender
from mindinsight.debugger.common.log import LOGGER as log
from mindinsight.debugger.common.utils import get_ack_reply, ServerStatus, \
Streams
Streams, RunLevel
from mindinsight.debugger.proto import debug_grpc_pb2_grpc as grpc_server_base
from mindinsight.debugger.proto.ms_graph_pb2 import GraphProto
from mindinsight.conditionmgr.condition import ConditionContext


def debugger_wrap(func):
@@ -39,7 +41,7 @@ def debugger_wrap(func):
class DebuggerGrpcServer(grpc_server_base.EventListenerServicer):
"""The grpc server used to interactive with grpc client."""

def __init__(self, cache_store):
def __init__(self, cache_store, condition_mgr):
"""
Initialize.

@@ -48,6 +50,7 @@ class DebuggerGrpcServer(grpc_server_base.EventListenerServicer):
"""
cache_store.initialize()
self._cache_store = cache_store
self._condition_mgr = condition_mgr
# the next position of command queue to be queried
self._pos = None
# the status of grpc server, the value is in ServerStatus
@@ -66,7 +69,7 @@ class DebuggerGrpcServer(grpc_server_base.EventListenerServicer):
self._status = ServerStatus.PENDING
self._old_run_cmd = {}
self._received_view_cmd = {}
self._received_hit = False
self._received_hit = []
self._cache_store.clean()

@debugger_wrap
@@ -90,25 +93,46 @@ class DebuggerGrpcServer(grpc_server_base.EventListenerServicer):
reply = get_ack_reply(1)
log.warning("Failed to get command event.")
else:
log.info("Reply to WaitCMD: %s", reply)
log.debug("Reply to WaitCMD: %s", reply)
return reply

def _add_predefined_watchpoints(self, condition_context):
"""Add predefined watchpoints."""
log.debug("Add predefined watchpoints.")
graph_stream = self._cache_store.get_stream_handler(Streams.GRAPH)
watchpoints = mindinsight.conditionmgr.recommender.recommend_watchpoints(self._condition_mgr, graph_stream,
condition_context)
watch_point_stream_handler = self._cache_store.get_stream_handler(Streams.WATCHPOINT)
for watchpoint in watchpoints:
watch_point_stream_handler.create_watchpoint(
watch_condition=watchpoint.get_watch_condition_dict(),
watch_nodes=watchpoint.watch_nodes,
condition_mgr=self._condition_mgr
)

def _pre_process(self, request):
"""Pre-process before dealing with command."""
metadata_stream = self._cache_store.get_stream_handler(Streams.METADATA)
watchpoint_stream = self._cache_store.get_stream_handler(Streams.WATCHPOINT)
is_new_step = metadata_stream.step < request.cur_step
is_new_node = metadata_stream.full_name != request.cur_node
# clean cache data at the beginning of new step
# Clean cached data at the beginning of a new step or when the current node has changed.
if is_new_step or is_new_node:
self._cache_store.clean_data()
if is_new_step:
self._cache_store.get_stream_handler(Streams.WATCHPOINT_HIT).clean()
self._cache_store.get_stream_handler(Streams.TENSOR).clean_tensors(request.cur_step)
watchpoint_stream.clean_temp_cached_names()
# receive graph at the beginning of the training
if self._status == ServerStatus.RECEIVE_GRAPH:
condition_context = ConditionContext(backend=request.backend, debugger_capability=(1, 0))
self._add_predefined_watchpoints(condition_context)
self._send_graph_flag(metadata_stream)
# receive new metadata
if is_new_step or is_new_node:
self._update_metadata(metadata_stream, request)
# Save the full name of the node for which MindSpore has stored the tensor.
watchpoint_stream.add_temp_cached_name(request.cur_node)
self._send_received_tensor_tag()
self._send_watchpoint_hit_flag()

@@ -139,9 +163,14 @@ class DebuggerGrpcServer(grpc_server_base.EventListenerServicer):
"""
# put new metadata into cache
metadata_stream.put(metadata_proto)
cur_node = self._cache_store.get_stream_handler(Streams.GRAPH).get_node_name_by_full_name(
metadata_proto.cur_node) if metadata_proto.cur_node else ''
# update current node name and graph name
graph_stream = self._cache_store.get_stream_handler(Streams.GRAPH)
full_name = metadata_proto.cur_node
graph_name = graph_stream.get_graph_id_by_full_name(
full_name) if full_name else metadata_stream.graph_name
cur_node = graph_stream.get_node_name_by_full_name(full_name, graph_name)
metadata_stream.node_name = cur_node
metadata_stream.graph_name = graph_name
metadata = metadata_stream.get()
self._cache_store.put_data(metadata)
log.debug("Put new metadata into data queue.")
@@ -151,7 +180,7 @@ class DebuggerGrpcServer(grpc_server_base.EventListenerServicer):
node_name = self._received_view_cmd.get('node_name')
if not node_name or self._received_view_cmd.get('wait_for_tensor'):
return
metadata = self._cache_store.get_stream_handler(Streams.METADATA).get()
metadata = self._cache_store.get_stream_handler(Streams.METADATA).get(['step', 'state'])
ret = {'receive_tensor': {'node_name': node_name}}
ret.update(metadata)
self._cache_store.put_data(ret)
@@ -161,9 +190,12 @@ class DebuggerGrpcServer(grpc_server_base.EventListenerServicer):
def _send_watchpoint_hit_flag(self):
"""Send Watchpoint hit flag."""
watchpoint_hit_stream = self._cache_store.get_stream_handler(Streams.WATCHPOINT_HIT)
if watchpoint_hit_stream.empty or not self._received_hit:
if not self._received_hit:
return
self._received_hit = False
watchpoint_hits = self._received_hit
self._received_hit = []
for watchpoint_hit in watchpoint_hits:
watchpoint_hit_stream.put(watchpoint_hit)
watchpoint_hits_info = watchpoint_hit_stream.get()
self._cache_store.put_data(watchpoint_hits_info)
log.debug("Send the watchpoint hits to DataQueue.\nSend the reply.")
@@ -187,7 +219,6 @@ class DebuggerGrpcServer(grpc_server_base.EventListenerServicer):
event = self._deal_with_left_continue_step(left_step_count)
else:
event = self._deal_with_left_continue_node(node_name)
self._cache_store.get_stream_handler(Streams.WATCHPOINT_HIT).clean()
log.debug("Send old RunCMD. Clean watchpoint hit.")
return event

@@ -260,7 +291,10 @@ class DebuggerGrpcServer(grpc_server_base.EventListenerServicer):
event = self._deal_with_run_cmd(event)
elif event.HasField('exit'):
self._cache_store.clean()
log.info("Clean cache for exit cmd.")
log.debug("Clean cache for exit cmd.")
else:
self._cache_store.get_stream_handler(Streams.WATCHPOINT).clean_cache_set_cmd(event.set_cmd)
log.debug("get set cmd.")

return event

@@ -294,7 +328,9 @@ class DebuggerGrpcServer(grpc_server_base.EventListenerServicer):
elif run_cmd.node_name:
self._old_run_cmd['node_name'] = run_cmd.node_name
run_cmd.node_name = ''
self._cache_store.get_stream_handler(Streams.WATCHPOINT_HIT).clean()
# clean watchpoint hit cache
if run_cmd.run_level == RunLevel.RECHECK.value:
self._cache_store.get_stream_handler(Streams.WATCHPOINT_HIT).clean()
log.debug("Receive RunCMD. Clean watchpoint hit cache.")

return event
@@ -330,9 +366,34 @@ class DebuggerGrpcServer(grpc_server_base.EventListenerServicer):
for chunk in request_iterator:
serial_graph += chunk.buffer
graph = GraphProto.FromString(serial_graph)
log.debug("Deserialize the graph. Receive %s nodes", len(graph.node))
self._cache_store.get_stream_handler(Streams.GRAPH).put(graph)
log.debug("Deserialize the graph %s. Receive %s nodes", graph.name, len(graph.node))
graph_dict = {graph.name: graph}
self._cache_store.get_stream_handler(Streams.GRAPH).put(graph_dict)
self._cache_store.get_stream_handler(Streams.TENSOR).put_const_vals(graph.const_vals)
self._cache_store.get_stream_handler(Streams.METADATA).graph_name = graph.name
self._status = ServerStatus.RECEIVE_GRAPH
reply = get_ack_reply()
log.debug("Send the reply for graph.")
return reply

@debugger_wrap
def SendMultiGraphs(self, request_iterator, context):
"""Send graph into DebuggerCache."""
log.info("Received graph.")
serial_graph = b""
graph_dict = {}
for chunk in request_iterator:
serial_graph += chunk.buffer
if chunk.finished:
sub_graph = GraphProto.FromString(serial_graph)
graph_dict[sub_graph.name] = sub_graph
log.debug("Deserialize the graph %s. Receive %s nodes", sub_graph.name,
len(sub_graph.node))
serial_graph = b""
self._cache_store.get_stream_handler(Streams.TENSOR).put_const_vals(
sub_graph.const_vals)

self._cache_store.get_stream_handler(Streams.GRAPH).put(graph_dict)
self._status = ServerStatus.RECEIVE_GRAPH
reply = get_ack_reply()
log.debug("Send the reply for graph.")
@@ -365,22 +426,30 @@ class DebuggerGrpcServer(grpc_server_base.EventListenerServicer):
"""Send watchpoint hits info DebuggerCache."""
log.info("Received WatchpointHits. Left run cmd %s change to emtpy.", self._old_run_cmd)
self._old_run_cmd.clear()
self._received_hit = True
watchpoint_hit_stream = self._cache_store.get_stream_handler(Streams.WATCHPOINT_HIT)
if self._cache_store.get_stream_handler(Streams.METADATA).state == ServerStatus.RUNNING.value:
# If the client session is running a script, all cached commands should be cleared
# when watchpoint hits are received.
self._cache_store.clean_command()

# save the watchpoint_hits data
watchpoint_hits = []
watchpoint_stream = self._cache_store.get_stream_handler(Streams.WATCHPOINT)
graph_stream = self._cache_store.get_stream_handler(Streams.GRAPH)
for watchpoint_hit_proto in request_iterator:
ui_node_name = graph_stream.get_node_name_by_full_name(
watchpoint_hit_proto.tensor.node_name)
node_full_name = watchpoint_hit_proto.tensor.node_name
graph_name = graph_stream.get_graph_id_by_full_name(node_full_name)
ui_node_name = graph_stream.get_node_name_by_full_name(node_full_name, graph_name)
log.debug("Receive watch point hit: %s", watchpoint_hit_proto)
if not ui_node_name:
log.info("Not support to show %s on graph.", watchpoint_hit_proto.tensor.node_name)
log.info("Not support to show %s on graph.", node_full_name)
continue
watchpoint_hit = {
'tensor_proto': watchpoint_hit_proto.tensor,
'watchpoint': watchpoint_stream.get_watchpoint_by_id(watchpoint_hit_proto.id),
'node_name': ui_node_name
'node_name': ui_node_name,
'graph_name': graph_name
}
watchpoint_hit_stream.put(watchpoint_hit)
watchpoint_hits.append(watchpoint_hit)
self._received_hit = watchpoint_hits
reply = get_ack_reply()
return reply
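SendMultiGraphs reassembles each sub-graph from a stream of Chunks, using the new `finished` flag as a per-graph delimiter. A hedged sketch of how a client might produce such a stream; the generator, helper name, and chunk size are illustrative, not part of the server's contract.

# Illustrative client-side generator producing the Chunk stream that
# SendMultiGraphs expects: each serialized GraphProto is split into pieces
# and the last piece of every graph carries finished=True. The chunk size
# is an arbitrary choice for this sketch.
from mindinsight.debugger.proto.debug_grpc_pb2 import Chunk

CHUNK_SIZE = 1024 * 1024  # 1 MiB per chunk

def chunk_graphs(serialized_graphs):
    """Yield Chunk messages for an iterable of serialized graph byte strings."""
    for serialized in serialized_graphs:
        if not serialized:
            # Still emit a terminator so the server closes out the graph.
            yield Chunk(buffer=b"", finished=True)
            continue
        for offset in range(0, len(serialized), CHUNK_SIZE):
            piece = serialized[offset:offset + CHUNK_SIZE]
            is_last = offset + CHUNK_SIZE >= len(serialized)
            yield Chunk(buffer=piece, finished=is_last)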

+ 299
- 118
mindinsight/debugger/debugger_server.py View File

@@ -16,23 +16,26 @@
import signal
from concurrent import futures
from threading import Thread

import grpc

from mindinsight.conditionmgr.conditionmgr import ConditionMgr
from mindinsight.conditionmgr.condition import ConditionContext, ConditionIdEnum
from mindinsight.conf import settings
from mindinsight.datavisual.data_transform.graph import NodeTypeEnum
from mindinsight.datavisual.utils.tools import to_float
from mindinsight.debugger.common.exceptions.exceptions import DebuggerParamValueError, \
DebuggerParamTypeError, DebuggerCreateWatchPointError, DebuggerUpdateWatchPointError, \
DebuggerDeleteWatchPointError, DebuggerContinueError, DebuggerPauseError, \
DebuggerCompareTensorError
from mindinsight.debugger.common.log import logger as log
DebuggerCompareTensorError, DebuggerRecheckError
from mindinsight.debugger.common.log import LOGGER as log
from mindinsight.debugger.common.utils import get_ack_reply, ServerStatus, \
create_view_event_from_tensor_history, Streams, is_scope_type, NodeBasicInfo
create_view_event_from_tensor_history, Streams, is_scope_type, RunLevel
from mindinsight.conditionmgr.common.utils import NodeBasicInfo
from mindinsight.debugger.debugger_cache import DebuggerCache
from mindinsight.debugger.debugger_grpc_server import DebuggerGrpcServer
from mindinsight.debugger.proto import debug_grpc_pb2_grpc as grpc_server_base
from mindinsight.debugger.proto.debug_grpc_pb2 import RunCMD
from mindinsight.debugger.stream_operator.tensor_detail_info import TensorDetailInfo
from mindinsight.utils.exceptions import MindInsightException
from mindinsight.utils.tensor import TensorUtils, MAX_DIMENSIONS_FOR_TENSOR

@@ -42,11 +45,26 @@ class DebuggerServer:

def __init__(self, grpc_port=None):
self.grpc_port = grpc_port
self.condition_mgr = ConditionMgr()
self.cache_store = DebuggerCache()
self.grpc_server = DebuggerGrpcServer(self.cache_store)
self.grpc_server = DebuggerGrpcServer(self.cache_store, self.condition_mgr)
self.grpc_server_manager = None
self.back_server = None

def get_conditions(self, train_id):
"""Get all default conditions"""
metadata_stream = self.cache_store.get_stream_handler(Streams.METADATA)
condition_context = ConditionContext(metadata_stream.backend, metadata_stream.step, (1, 0))
log.debug("Train_id: %s, backend: %s", train_id, condition_context.backend)
return self.condition_mgr.get_all(condition_context)

def get_condition_collections(self, train_id):
"""Get default condition_collections"""
metadata_stream = self.cache_store.get_stream_handler(Streams.METADATA)
condition_context = ConditionContext(metadata_stream.backend, metadata_stream.step, (1, 0))
log.debug("Train_id: %s, backend: %s", train_id, condition_context.backend)
return self.condition_mgr.get_all_collections(condition_context)

def start(self):
"""Start server."""
grpc_port = self.grpc_port if self.grpc_port else "50051"
@@ -97,24 +115,34 @@ class DebuggerServer:

return reply

def search(self, name, watch_point_id=0):
def search(self, filter_condition):
"""
Search for single node in graph.

Args:
name (str): The name pattern.
watch_point_id (int): The id of watchpoint. Default: 0.
filter_condition (dict): Filter condition.

- name (str): The name pattern.
- graph_name (str): The graph name.
- watch_point_id (int): The id of watchpoint. Default: 0.
- node_category (str): The node category. Default: None.

Returns:
dict, the searched nodes.
"""
log.info("receive search request for node:%s, in watchpoint:%d", name, watch_point_id)
log.info("receive search request with filter_condition: %s", filter_condition)
# validate watchpoint id
watch_point_id = filter_condition.pop('watch_point_id', 0)
watchpoint_stream = self.cache_store.get_stream_handler(Streams.WATCHPOINT)
watchpoint_stream.validate_watchpoint_id(watch_point_id)
# validate and update graph name
graph_stream = self.cache_store.get_stream_handler(Streams.GRAPH)
graph = graph_stream.search_nodes(name)
graph_name = graph_stream.validate_graph_name(filter_condition.get('graph_name'))
filter_condition['graph_name'] = graph_name
# get searched graph
graph = graph_stream.search_nodes(filter_condition)
# add watched label to graph
watchpoint_stream.set_watch_nodes(graph, graph_stream, watch_point_id)
watchpoint_stream.set_watch_nodes(graph, graph_stream, watch_point_id, graph_name)
return graph

def tensor_comparisons(self, name, shape, detail='data', tolerance='0'):
@@ -150,7 +178,8 @@ class DebuggerServer:
if node_type == NodeTypeEnum.PARAMETER.value:
reply = tensor_stream.get_tensors_diff(tensor_name, parsed_shape, tolerance)
else:
raise DebuggerParamValueError("The node type must be parameter, but got {}.".format(node_type))
raise DebuggerParamValueError(
"The node type must be parameter, but got {}.".format(node_type))
return reply

def retrieve(self, mode, filter_condition=None):
@@ -196,10 +225,13 @@ class DebuggerServer:
self.cache_store.clean_data()
log.info("Clean data queue cache when retrieve all request.")
result = {}
for stream in [Streams.METADATA, Streams.GRAPH, Streams.WATCHPOINT]:
for stream in [Streams.METADATA, Streams.GRAPH]:
sub_res = self.cache_store.get_stream_handler(stream).get()
result.update(sub_res)

sub_res = self._hide_parameters_for_ui()
result.update(sub_res)

return result

def _retrieve_node(self, filter_condition):
@@ -210,10 +242,9 @@ class DebuggerServer:
filter_condition (dict): Filter condition.

- name (str): The name of single node.
- graph_name (str): The relative graph_name of the node.
- single_node (bool): If False, return the sub-layer of single node. If True, return
the node list from root node to single node.

- watch_point_id (int): The id of watchpoint.

Returns:
@@ -222,9 +253,13 @@ class DebuggerServer:
log.debug("Retrieve node %s.", filter_condition)
# validate node name
node_name = filter_condition.get('name')
graph_stream = self.cache_store.get_stream_handler(Streams.GRAPH)
graph_name = graph_stream.validate_graph_name(filter_condition.get('graph_name'))
if node_name:
self.cache_store.get_stream_handler(Streams.GRAPH).get_node_type(node_name)
# validate node name
graph_stream.get_node_type(node_name, graph_name)
filter_condition['single_node'] = bool(filter_condition.get('single_node'))
filter_condition['graph_name'] = graph_name
reply = self._get_nodes_info(filter_condition)
return reply

@@ -236,10 +271,9 @@ class DebuggerServer:
filter_condition (dict): The filter condition.

- name (str): The node name.
- graph_name (str): The relative graph_name of the node.
- single_node (bool): If False, return the sub-layer of single node. If True, return
the node list from root node to single node.

- watch_point_id (int): The id of watchpoint.

Returns:
@@ -254,15 +288,16 @@ class DebuggerServer:
reply = graph_stream.get(filter_condition)
graph = reply.get('graph')
# add watched label to graph
watchpoint_stream.set_watch_nodes(graph, graph_stream, watch_point_id)
watchpoint_stream.set_watch_nodes(graph, graph_stream, watch_point_id, filter_condition.get('graph_name'))
return reply

def retrieve_tensor_history(self, node_name):
def retrieve_tensor_history(self, node_name, graph_name=None):
"""
Retrieve tensor history for leaf node.

Args:
node_name (str): The name of leaf node.
graph_name (str): The graph name. Default: None.

Returns:
dict, the tensor history and metadata.
@@ -271,39 +306,31 @@ class DebuggerServer:
metadata_stream = self.cache_store.get_stream_handler(Streams.METADATA)
if metadata_stream.state == ServerStatus.PENDING.value:
log.info("The backend is in pending status.")
return metadata_stream.get()
self._validate_leaf_name(node_name)
res = self._get_tensor_history(node_name)
return metadata_stream.get(['state', 'step'])
res = self._get_tensor_history(node_name, graph_name)
return res

def _validate_leaf_name(self, node_name):
"""Validate if the node is a leaf node."""
graph_stream = self.cache_store.get_stream_handler(Streams.GRAPH)
node_type = graph_stream.get_node_type(node_name)
if is_scope_type(node_type):
log.error("Scope type node has no tensor history.")
raise DebuggerParamValueError("Invalid leaf node name.")

def _get_tensor_history(self, node_name):
def _get_tensor_history(self, node_name, graph_name=None):
"""
Get tensor history for single node.

Args:
node_name (str): The name of leaf node.
graph_name (str): The graph name. Default: None.

Returns:
dict, the tensor history and metadata.
"""
# get basic tensor history
graph_stream = self.cache_store.get_stream_handler(Streams.GRAPH)
tensor_history = graph_stream.get_tensor_history(node_name)
tensor_history = graph_stream.get_tensor_history(node_name, graph_name)
# add tensor value for tensor history
self._add_tensor_value_for_tensor_history(tensor_history, node_name)
# add hit label for tensor history
watchpoint_hit_stream = self.cache_store.get_stream_handler(Streams.WATCHPOINT_HIT)
watchpoint_hit_stream.update_tensor_history(tensor_history)
# add metadata
metadata = self.cache_store.get_stream_handler(Streams.METADATA).get()
metadata = self.cache_store.get_stream_handler(Streams.METADATA).get(['state', 'step'])
tensor_history.update(metadata)
return tensor_history

@@ -325,28 +352,30 @@ class DebuggerServer:
self.cache_store.put_command({'view_cmd': view_cmd, 'node_name': node_name})
log.debug("Send view cmd.")

def retrieve_tensor_value(self, name, detail, shape):
def retrieve_tensor_value(self, name, detail, shape, graph_name=None, prev=False):
"""Retrieve the tensor value."""
log.info("Retrieve tensor value: name: %s, detail: %s, shape: %s", name, detail, shape)
self.validate_tensor_param(name, detail)
# Limit to query max two dimensions for tensor in table view.
parsed_shape = TensorUtils.parse_shape(shape, limit=MAX_DIMENSIONS_FOR_TENSOR)
node_type, tensor_name = self._get_tensor_name_and_type_by_ui_name(name)
node_type, tensor_name = self._get_tensor_name_and_type_by_ui_name(name, graph_name)
reply = self.cache_store.get_stream_handler(Streams.TENSOR).get(
{'name': tensor_name,
'node_type': node_type,
'shape': parsed_shape}
'shape': parsed_shape,
'prev': prev}
)
reply['tensor_value']['name'] = name

return reply

def _get_tensor_name_and_type_by_ui_name(self, name):
def _get_tensor_name_and_type_by_ui_name(self, name, graph_name=None):
"""
Get inner tensor name and type by UI name.

Args:
name (str): Node name shown in UI.
graph_name (Union[str, None]): The graph name, default is: None.

Returns:
tuple, the node type and the full name of the tensor.
@@ -354,8 +383,9 @@ class DebuggerServer:
"""
node_name, slot = name.rsplit(':', 1)
graph_stream = self.cache_store.get_stream_handler(Streams.GRAPH)
node_type = graph_stream.get_node_type(node_name)
full_name = graph_stream.get_full_name(node_name)
graph_name = graph_name if graph_name else graph_stream.get_graph_id_by_name(node_name)
node_type = graph_stream.get_node_type(node_name, graph_name)
full_name = graph_stream.get_full_name(node_name, graph_name)
tensor_name = full_name + ':' + slot
return node_type, tensor_name

@@ -379,9 +409,7 @@ class DebuggerServer:
filter_condition (dict): Filter condition.

- watch_point_id (int): The id of watchpoint. If not given, return all watchpoints.

- name (str): The name of single node.

- single_node (bool): If False, return the sub-layer of single node. If True, return
the node list from root node to single node.

@@ -390,7 +418,7 @@ class DebuggerServer:
"""
watchpoint_id = filter_condition.get('watch_point_id', 0)
if not watchpoint_id:
reply = self.cache_store.get_stream_handler(Streams.WATCHPOINT).get()
reply = self._hide_parameters_for_ui()
log.debug("Get condition of watchpoints.")
else:
reply = self._retrieve_node(filter_condition)
@@ -406,7 +434,6 @@ class DebuggerServer:
filter_condition (dict): Filter condition.

- name (str): The name of single node.

- single_node (bool): If False, return the sub-layer of single node. If True, return
the node list from root node to single node.

@@ -418,34 +445,48 @@ class DebuggerServer:
if node_name is None:
reply = self.cache_store.get_stream_handler(Streams.WATCHPOINT_HIT).get()
return reply
# get tensor history and graph of the hit node.
self._validate_leaf_name(node_name)
# get tensor history
reply = self._get_tensor_history(node_name)
log.debug("Get tensor history for watchpoint hit node.")
# get single graph
if filter_condition.get('single_node'):
graph_name = self.cache_store.get_stream_handler(Streams.GRAPH).validate_graph_name(
filter_condition.get('graph_name'))
filter_condition['graph_name'] = graph_name
graph = self._get_nodes_info(filter_condition)
reply.update(graph)
log.debug("Get tensor history for watchpoint hit node.")

return reply

def create_watchpoint(self, watch_condition, watch_nodes=None, watch_point_id=None):
def create_watchpoint(self, watch_condition, watch_nodes=None, watch_point_id=None, search_pattern=None,
graph_name=None):
"""
Create watchpoint.

Args:
watch_condition (dict): The watch condition.

- condition (str): Accept `INF` or `NAN`.

- param (list[float]): Not defined yet.
watch_condition (dict): The watch condition. The format is like:
{
"id": "tensor_too_large",
"params": [
{
"name": "abs_mean_gt",
"disable": false,
"value": 1.1
}
]
}

- id (str): Id of condition.

- params (list[dict]): The list of param for this condition.
watch_nodes (list[str]): The list of node names.
watch_point_id (int): The id of watchpoint.
search_pattern (dict): The search pattern. Default: None.
graph_name (str): The relative graph_name of the watched node. Default: None.

Returns:
dict, the id of new watchpoint.
dict, the id of new watchpoint and metadata info.
"""
log.info("Received create watchpoint request. WatchCondition: %s", watch_condition)
metadata_stream = self.cache_store.get_stream_handler(Streams.METADATA)
@@ -453,17 +494,28 @@ class DebuggerServer:
log.error("Failed to create watchpoint as the MindSpore is not in waiting state.")
raise DebuggerCreateWatchPointError(
"Failed to create watchpoint as the MindSpore is not in waiting state.")
if metadata_stream.backend == 'GPU' and watch_condition.get('condition') == 'OVERFLOW':
log.error("GPU doesn't support OVERFLOW watch condition.")
raise DebuggerParamValueError("GPU doesn't support OVERFLOW watch condition.")
if metadata_stream.backend == 'GPU' and watch_condition.get('id') in (
ConditionIdEnum.OVERFLOW_ASCEND_CHIP.value, ConditionIdEnum.OPERATOR_OVERFLOW.value):
log.error("GPU doesn't support overflow watch condition.")
raise DebuggerParamValueError("GPU doesn't support overflow watch condition.")

if metadata_stream.backend == 'Ascend' and watch_condition.get('id') == ConditionIdEnum.NAN.value:
log.error("Ascend doesn't support nan watch condition.")
raise DebuggerParamValueError("Ascend doesn't support nan watch condition.")

watch_nodes = self._get_node_basic_infos(watch_nodes)
watch_point_id = self.cache_store.get_stream_handler(Streams.WATCHPOINT).create_watchpoint(
watch_condition, watch_nodes, watch_point_id)
watch_nodes = self._get_watch_node_with_basic_info(
node_names=watch_nodes, search_pattern=search_pattern, graph_name=graph_name)
watchpoint_stream = self.cache_store.get_stream_handler(Streams.WATCHPOINT)
watch_point_id = watchpoint_stream.create_watchpoint(
self.condition_mgr, watch_condition, watch_nodes, watch_point_id)
log.info("Create watchpoint %d", watch_point_id)
return {'id': watch_point_id}

def update_watchpoint(self, watch_point_id, watch_nodes, mode, name=None):
metadata_stream.enable_recheck = watchpoint_stream.is_recheckable(metadata_stream.backend)
res = metadata_stream.get(['state', 'enable_recheck'])
res['id'] = watch_point_id
return res

def update_watchpoint(self, watch_point_id, watch_nodes, mode, search_pattern=None, graph_name=None):
"""
Update watchpoint.

@@ -472,13 +524,14 @@ class DebuggerServer:
watch_nodes (list[str]): The list of node names.
mode (int): The update operator on nodes. 0 for remove nodes from watch nodes.
1 for add nodes to watch nodes.
name (str): The search name. Default: None.
search_pattern (dict): The search pattern. Default: None.
graph_name (str): The relative graph_name of the watched node. Default: None.

Returns:
dict, empty response.
dict, the metadata info.
"""
if self.cache_store.get_stream_handler(
Streams.METADATA).state != ServerStatus.WAITING.value:
metadata_stream = self.cache_store.get_stream_handler(Streams.METADATA)
if metadata_stream.state != ServerStatus.WAITING.value:
log.error("Failed to update watchpoint as the MindSpore is not in waiting state.")
raise DebuggerUpdateWatchPointError(
"Failed to update watchpoint as the MindSpore is not in waiting state."
@@ -489,22 +542,40 @@ class DebuggerServer:
if not watch_nodes or not watch_point_id:
log.error("Invalid parameter for update watchpoint.")
raise DebuggerParamValueError("Invalid parameter for update watchpoint.")
# update watch node
if name is not None:
watch_nodes = self._get_watch_nodes_by_search(watch_nodes)
elif mode == 1:
watch_nodes = self._get_node_basic_infos(watch_nodes)

# get node basic info for watch nodes
watch_nodes = self._get_watch_node_with_basic_info(watch_nodes, search_pattern, graph_name)
watchpoint_stream.update_watchpoint(watch_point_id, watch_nodes, mode)
metadata_stream.enable_recheck = watchpoint_stream.is_recheckable(metadata_stream.backend)
log.info("Update watchpoint with id: %d", watch_point_id)
return {}
return metadata_stream.get(['state', 'enable_recheck'])

def _get_watch_nodes_by_search(self, watch_nodes):
def _get_watch_node_with_basic_info(self, node_names, search_pattern=None, graph_name=None):
"""
Get watch node with basic info.

Args:
node_names (list[str]): A list of node names.
search_pattern (dict): Get watch node with search pattern. Default: None
graph_name (str): The relative graph_name of the watched node. Default: None.

Returns:
list[NodeBasicInfo], a list of node basic infos.
"""
graph_stream = self.cache_store.get_stream_handler(Streams.GRAPH)
graph_name = graph_stream.validate_graph_name(graph_name)
if search_pattern is not None:
watch_nodes = self._get_watch_nodes_by_search(node_names, search_pattern, graph_name)
else:
watch_nodes = self._get_node_basic_infos(node_names, graph_name=graph_name)
return watch_nodes

def _get_watch_nodes_by_search(self, watch_nodes, search_pattern, graph_name):
"""Get watched leaf nodes by search name."""
watched_leaf_nodes = []
graph_stream = self.cache_store.get_stream_handler(Streams.GRAPH)
new_pattern = {'graph_name': graph_name}
new_pattern.update(search_pattern)
for search_name in watch_nodes:
search_nodes = graph_stream.get_searched_node_list()
search_nodes = graph_stream.get_searched_node_list(new_pattern)
search_node_names = [
NodeBasicInfo(name=node.name, full_name=node.full_name, type=node.type)
for node in search_nodes
@@ -515,41 +586,48 @@ class DebuggerServer:

return watched_leaf_nodes

def delete_watchpoint(self, watch_point_id):
def delete_watchpoint(self, watch_point_id=None):
"""
Delete watchpoint.

Args:
watch_point_id (int): The id of watchpoint.
watch_point_id (Union[None, int]): The id of watchpoint.
If None, delete all watchpoints. Default: None.

Returns:
dict, empty response.
dict, the metadata info.
"""
if self.cache_store.get_stream_handler(
Streams.METADATA).state != ServerStatus.WAITING.value:
metadata_stream = self.cache_store.get_stream_handler(Streams.METADATA)
if metadata_stream.state != ServerStatus.WAITING.value:
log.error("Failed to delete watchpoint as the MindSpore is not in waiting state.")
raise DebuggerDeleteWatchPointError(
"Failed to delete watchpoint as the MindSpore is not in waiting state."
)
self.cache_store.get_stream_handler(Streams.WATCHPOINT).delete_watchpoint(watch_point_id)
log.info("Delete watchpoint with id: %d", watch_point_id)
return {}
watchpoint_stream = self.cache_store.get_stream_handler(Streams.WATCHPOINT)
watchpoint_stream.delete_watchpoint(watch_point_id)
metadata_stream.enable_recheck = watchpoint_stream.is_recheckable()
log.info("Delete watchpoint with id: %s", watch_point_id)
return metadata_stream.get(['state', 'enable_recheck'])

def _get_node_basic_infos(self, node_names, graph_name=None):
"""
Get node info according to node names.

def _get_node_basic_infos(self, node_names):
"""Get node info according to node names."""
Args:
node_names (list[str]): A list of node names.
graph_name (str): The relative graph_name of the watched node. Default: None.

Returns:
list[NodeBasicInfo], a list of basic node infos.
"""
if not node_names:
return []
graph_stream = self.cache_store.get_stream_handler(Streams.GRAPH)
node_infos = []
for node_name in node_names:
node_type = graph_stream.get_node_type(node_name)
if node_type == NodeTypeEnum.AGGREGATION_SCOPE.value:
sub_nodes = graph_stream.get_nodes_by_scope(node_name)
sub_infos = [NodeBasicInfo(name=node.name, full_name=node.full_name, type=node.type)
for node in sub_nodes]
node_infos.extend(sub_infos)
full_name = graph_stream.get_full_name(node_name)
node_infos.append(NodeBasicInfo(name=node_name, full_name=full_name, type=node_type))
node_info = graph_stream.get_node_basic_info(node_name, graph_name)
node_infos.append(node_info)

return node_infos

def control(self, params=None):
@@ -561,14 +639,12 @@ class DebuggerServer:

- mode (str): Acceptable control command, including `continue`,
`pause` and `terminate`.

- level (str): The control granularity, `node` level or `step` level.
Default: `step`.

- steps (int): Specify the steps that training should run.
Used when `level` is `step`.

- name (str): Specify the name of the node. Used when `level` is `node`.
- graph_name (str): The graph name.

Returns:
dict, the response.
@@ -597,6 +673,9 @@ class DebuggerServer:
Args:
metadata_stream (MetadataHandler): The metadata_handler
params (dict): The control params.

Returns:
dict, metadata info.
"""
if metadata_stream.state != ServerStatus.WAITING.value:
log.error("MindSpore is not ready to run. Current state is: %s", metadata_stream.state)
@@ -604,7 +683,6 @@ class DebuggerServer:
"MindSpore is not ready to run or is running currently."
)
metadata_stream.state = ServerStatus.RUNNING.value
current_state = ServerStatus.RUNNING.value
try:
event = self._construct_run_event(params)
self._send_watchpoints()
@@ -612,13 +690,12 @@ class DebuggerServer:
except MindInsightException as err:
log.error("Failed to send run event.")
log.exception(err)
current_state = ServerStatus.WAITING.value
metadata_stream.state = current_state
metadata_stream.state = ServerStatus.WAITING.value
raise DebuggerContinueError("Failed to send run command.")
else:
metadata_stream.enable_recheck = False
log.debug("Send the RunCMD to command queue.")

return {'metadata': {'state': current_state}}
return metadata_stream.get(['state', 'enable_recheck'])

def _construct_run_event(self, params):
"""
@@ -627,18 +704,22 @@ class DebuggerServer:
Args:
params (dict): The control params.

- level (str): The control granularity, `node` level or `step` level.
- level (str): The control granularity, `node`, `step` or `recheck` level.
Default: `step`.

- steps (int): Specify the steps that training should run.
Used when `level` is `step`.

- name (str): Specify the name of the node. Used when `level` is `node`.
- graph_name (str): The graph name.

Returns:
EventReply, control event with run command.
"""
level = params.get('level', 'step')
# validate level
if level not in [RunLevel.NODE.value, RunLevel.STEP.value, RunLevel.RECHECK.value]:
log.error("Invalid Value. `level` should be `step`, `node` or `recheck`. Got %s", level)
raise DebuggerParamValueError("level` should be `step`, `node` or `recheck`.")
# construct run command events
event = get_ack_reply()
if level == 'step':
steps = params.get('steps')
@@ -646,31 +727,37 @@ class DebuggerServer:
steps = 1
run_cmd = RunCMD(run_level='step', run_steps=steps)
elif level == 'node':
name = params.get('name')
name = params.get('name', '')
graph_name = params.get('graph_name')
if name:
self._validate_leaf_name(name)
name = self.cache_store.get_stream_handler(Streams.GRAPH).get_full_name(name)
else:
name = ''
self._validate_leaf_name(name, graph_name)
name = self.cache_store.get_stream_handler(Streams.GRAPH).get_full_name(name, graph_name)
run_cmd = RunCMD(run_level='node', node_name=name)
else:
log.error("Invalid Value. `level` should be `step` or `node`. Got %s", level)
raise DebuggerParamValueError("level` should be `step` or `node`")
run_cmd = RunCMD(run_level='recheck')

event.run_cmd.CopyFrom(run_cmd)
log.debug("Construct run event. %s", event)
return event

def _validate_leaf_name(self, node_name, graph_name):
"""Validate if the node is a leaf node."""
graph_stream = self.cache_store.get_stream_handler(Streams.GRAPH)
node_type = graph_stream.get_node_type(node_name, graph_name)
if is_scope_type(node_type):
log.error("Scope type node has no tensor history.")
raise DebuggerParamValueError("Invalid leaf node name.")

def _send_watchpoints(self):
"""Set watchpoints."""
watchpoint_stream = self.cache_store.get_stream_handler(Streams.WATCHPOINT)
watchpoints = watchpoint_stream.get(filter_condition=True).get('watch_points')
if watchpoints:
for watchpoint in watchpoints:
set_commands = watchpoint_stream.get_pending_commands(self.cache_store.get_stream_handler(Streams.GRAPH))
if set_commands:
for set_cmd in set_commands:
event = get_ack_reply()
event.set_cmd.CopyFrom(watchpoint)
event.set_cmd.CopyFrom(set_cmd)
self.cache_store.put_command(event)
watchpoint_stream.sync_set_cmd()
watchpoint_stream.sync_set_cmd(set_commands)
log.debug("Send SetCMD to MindSpore. %s", event)

def _pause(self, metadata_stream):
@@ -679,6 +766,9 @@ class DebuggerServer:

Args:
metadata_stream (MetadataHandler): The metadata stream handler.

Returns:
dict, metadata info.
"""
if metadata_stream.state != ServerStatus.RUNNING.value:
log.error("The MindSpore is not running.")
@@ -687,8 +777,9 @@ class DebuggerServer:
event = get_ack_reply()
event.run_cmd.CopyFrom(RunCMD(run_level='step', run_steps=0))
self.cache_store.put_command(event)
metadata_stream.enable_recheck = False
log.debug("Send the Pause command")
return {'metadata': {'state': 'waiting'}}
return metadata_stream.get(['state', 'enable_recheck'])

def _terminate(self, metadata_stream):
"""
@@ -696,21 +787,27 @@ class DebuggerServer:

Args:
metadata_stream (MetadataHandler): The metadata stream handler.

Returns:
dict, metadata info.
"""
metadata_stream.state = 'pending'
self.cache_store.clean_data()
self.cache_store.clean_command()
event = get_ack_reply()
event.exit = True
self.cache_store.put_command(event)
metadata_stream.enable_recheck = False
log.debug("Send the ExitCMD.")
return {'metadata': {'state': 'pending'}}
return metadata_stream.get(['state', 'enable_recheck'])

def retrieve_node_by_bfs(self, node_name, ascend=False):
def retrieve_node_by_bfs(self, node_name, graph_name=None, ascend=False):
"""
Get the graph of the next node according to node_name.

Args:
node_name (str): The name of current chosen leaf node.
graph_name (str): The graph name.
ascend (bool): If True, traverse the input nodes;
If False, traverse the output nodes. Default: False.

@@ -721,6 +818,7 @@ class DebuggerServer:
node_name, ascend)
reply = {}
graph_stream = self.cache_store.get_stream_handler(Streams.GRAPH)
graph_name = graph_stream.validate_graph_name(graph_name)
next_node_name = graph_stream.get_node_by_bfs_order(node_name, ascend)
# no next node
if next_node_name is None:
@@ -728,6 +826,7 @@ class DebuggerServer:
# add graph and tensor history for next node
filter_condition = {
'name': next_node_name,
'graph_name': graph_name,
'single_node': True
}
search_graph = self._get_nodes_info(filter_condition)
@@ -735,3 +834,85 @@ class DebuggerServer:
reply.update(search_graph)

return reply

def recheck(self):
"""
Recheck all watchpoints.

Returns:
dict, metadata info.
"""
metadata_stream = self.cache_store.get_stream_handler(Streams.METADATA)
# validate backend status is able to recheck watchpoint
if not metadata_stream.enable_recheck:
log.error("Recheck is not available.")
raise DebuggerRecheckError("Recheck is not available.")
metadata_stream.state = ServerStatus.RUNNING.value
metadata_stream.enable_recheck = False
# send updated watchpoint and recheck command
try:
event = self._construct_run_event({'level': 'recheck'})
self._send_watchpoints()
self.cache_store.put_command(event)
except MindInsightException as err:
log.error("Failed to send recheck event.")
log.exception(err)
metadata_stream.state = ServerStatus.WAITING.value
metadata_stream.enable_recheck = True
raise DebuggerContinueError("Failed to send run command.")
else:
log.debug("Send the recheck to command queue.")
return metadata_stream.get(['state', 'enable_recheck'])

def retrieve_tensor_graph(self, tensor_name, graph_name):
"""
Retrieve tensor graph.

Args:
tensor_name (str): The tensor name from UI.
graph_name (str): The graph name.

Returns:
dict, tensor graph object.
"""
log.info("Retrieve tensor graph for %s from %s", tensor_name, graph_name)
tensor_graph_ops = TensorDetailInfo(self.cache_store).get_tensor_graph(tensor_name, graph_name)
return tensor_graph_ops

def retrieve_tensor_hits(self, tensor_name, graph_name):
"""
Retrieve tensor hit information.

Args:
tensor_name (str): The tensor name from UI.
graph_name (str): The graph name.

Returns:
dict, tensor hit info.
"""
log.info("Retrieve tensor hits for %s from %s", tensor_name, graph_name)
watch_points = TensorDetailInfo(self.cache_store).get_tensor_watch_points(tensor_name, graph_name)
return {'watch_points': watch_points}

def _hide_parameters_for_ui(self):
"""
Hide the condition parameters that are not visible on the UI.

Returns:
dict, watch point list.
"""
reply = self.cache_store.get_stream_handler(Streams.WATCHPOINT).get()
watch_points = reply.get('watch_points')
for i, watch_point in enumerate(watch_points):
watch_condition = watch_point.get('watch_condition')
parameters = watch_condition.get('params')
watch_condition_id = watch_condition.get('id')
mgr_condition = self.condition_mgr.get_condition(watch_condition_id)
ui_watch_condition = []
for param in parameters:
parameter_definition = mgr_condition.get_parameter_definition(param['name'])
if not parameter_definition.visible_on_ui:
continue
ui_watch_condition.append(param)
reply['watch_points'][i]['watch_condition']['params'] = ui_watch_condition
return reply
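`_hide_parameters_for_ui` strips the condition parameters whose definitions are flagged invisible before the watch point list reaches the front end. A standalone sketch of that filtering on plain dicts; the visibility table and sample values are made up, standing in for `ConditionMgr` parameter definitions.

# Visibility table standing in for parameter definitions (name -> visible_on_ui).
VISIBLE_ON_UI = {'tensor_too_large': {'abs_mean_gt': True, 'max_gt': False}}

watch_point = {
    'id': 1,
    'watch_condition': {
        'id': 'tensor_too_large',
        'params': [
            {'name': 'abs_mean_gt', 'value': 1.1},
            {'name': 'max_gt', 'value': 0.5},  # invisible: filtered out below
        ],
    },
}

condition = watch_point['watch_condition']
visible = VISIBLE_ON_UI[condition['id']]
condition['params'] = [param for param in condition['params'] if visible[param['name']]]
assert [param['name'] for param in condition['params']] == ['abs_mean_gt']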

+ 26
- 5
mindinsight/debugger/proto/debug_grpc.proto View File

@@ -27,6 +27,7 @@ service EventListener {
rpc SendGraph (stream Chunk) returns (EventReply) {};
rpc SendTensors (stream TensorProto) returns (EventReply) {};
rpc SendWatchpointHits (stream WatchpointHit) returns (EventReply) {};
rpc SendMultiGraphs (stream Chunk) returns (EventReply) {};
}

message Metadata {
@@ -38,11 +39,15 @@ message Metadata {
string cur_node = 4;
// check if training is done.
bool training_done = 5;
// the number of total graphs
int32 graph_num = 6;
}

message Chunk {
bytes buffer = 1;
bool finished = 2;
}

message EventReply {
enum Status {
OK = 0;
@@ -61,13 +66,11 @@ message EventReply {
}

message RunCMD {
// running level. 'step' or 'node'
// The run level: "step", "node" or "recheck".
string run_level = 1;

oneof cmd {
int32 run_steps = 2;

// the full name of next node
// The full name of the next node.
string node_name = 3;
}
}
@@ -96,9 +99,27 @@ message WatchCondition {
max_min_lt = 8;
mean_gt = 9;
mean_lt = 10;
sd_gt = 11;
sd_lt = 12;
tensor_general_overflow = 13;
tensor_initialization = 14;
tensor_too_large = 15;
tensor_too_small = 16;
tensor_all_zero = 17;
tensor_change_too_large = 18;
tensor_change_too_small = 19;
tensor_not_changed = 20;
}
Condition condition = 1;
float value = 2; // for between condition, there will be two values
float value = 2;
message Parameter {
string name = 1;
bool disabled = 2;
double value = 3;
bool hit = 4; // Whether this parameter is hit when checking tensor.
}
// Field number 3 is already used on the MindSpore side ("repeated bool include = 3"), so skip it for backward compatibility.
repeated Parameter params = 4;
}

message WatchNode {
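With the new `Parameter` sub-message, a `SetCMD` can carry per-parameter thresholds instead of the single legacy `value` field. A sketch of building one with the generated Python classes; the node name and threshold are placeholders.

from mindinsight.debugger.proto.debug_grpc_pb2 import SetCMD, WatchCondition, WatchNode

# Node name and threshold are placeholders for illustration.
set_cmd = SetCMD(
    id=1,
    watch_nodes=[WatchNode(node_name='Default/network/conv1', node_type='Conv2D')],
    watch_condition=WatchCondition(
        condition=WatchCondition.tensor_too_large,
        params=[WatchCondition.Parameter(name='abs_mean_gt', disabled=False, value=1.1)],
    ),
)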


+ 156
- 25
mindinsight/debugger/proto/debug_grpc_pb2.py View File

@@ -19,7 +19,7 @@ DESCRIPTOR = _descriptor.FileDescriptor(
package='debugger',
syntax='proto3',
serialized_options=None,
serialized_pb=b'\n+mindinsight/debugger/proto/debug_grpc.proto\x12\x08\x64\x65\x62ugger\x1a)mindinsight/debugger/proto/ms_graph.proto\"k\n\x08Metadata\x12\x13\n\x0b\x64\x65vice_name\x18\x01 \x01(\t\x12\x10\n\x08\x63ur_step\x18\x02 \x01(\x05\x12\x0f\n\x07\x62\x61\x63kend\x18\x03 \x01(\t\x12\x10\n\x08\x63ur_node\x18\x04 \x01(\t\x12\x15\n\rtraining_done\x18\x05 \x01(\x08\"\x17\n\x05\x43hunk\x12\x0e\n\x06\x62uffer\x18\x01 \x01(\x0c\"\xec\x01\n\nEventReply\x12+\n\x06status\x18\x01 \x01(\x0e\x32\x1b.debugger.EventReply.Status\x12\x0e\n\x04\x65xit\x18\x02 \x01(\x08H\x00\x12#\n\x07run_cmd\x18\x03 \x01(\x0b\x32\x10.debugger.RunCMDH\x00\x12#\n\x07set_cmd\x18\x04 \x01(\x0b\x32\x10.debugger.SetCMDH\x00\x12%\n\x08view_cmd\x18\x05 \x01(\x0b\x32\x11.debugger.ViewCMDH\x00\")\n\x06Status\x12\x06\n\x02OK\x10\x00\x12\n\n\x06\x46\x41ILED\x10\x01\x12\x0b\n\x07PENDING\x10\x02\x42\x05\n\x03\x63md\"L\n\x06RunCMD\x12\x11\n\trun_level\x18\x01 \x01(\t\x12\x13\n\trun_steps\x18\x02 \x01(\x05H\x00\x12\x13\n\tnode_name\x18\x03 \x01(\tH\x00\x42\x05\n\x03\x63md\"\x81\x01\n\x06SetCMD\x12(\n\x0bwatch_nodes\x18\x01 \x03(\x0b\x32\x13.debugger.WatchNode\x12\x31\n\x0fwatch_condition\x18\x02 \x01(\x0b\x32\x18.debugger.WatchCondition\x12\x0e\n\x06\x64\x65lete\x18\x03 \x01(\x08\x12\n\n\x02id\x18\x04 \x01(\x05\"1\n\x07ViewCMD\x12&\n\x07tensors\x18\x01 \x03(\x0b\x32\x15.debugger.TensorProto\"\xee\x01\n\x0eWatchCondition\x12\x35\n\tcondition\x18\x01 \x01(\x0e\x32\".debugger.WatchCondition.Condition\x12\r\n\x05value\x18\x02 \x01(\x02\"\x95\x01\n\tCondition\x12\x07\n\x03nan\x10\x00\x12\x07\n\x03inf\x10\x01\x12\x0c\n\x08overflow\x10\x02\x12\n\n\x06max_gt\x10\x03\x12\n\n\x06max_lt\x10\x04\x12\n\n\x06min_gt\x10\x05\x12\n\n\x06min_lt\x10\x06\x12\x0e\n\nmax_min_gt\x10\x07\x12\x0e\n\nmax_min_lt\x10\x08\x12\x0b\n\x07mean_gt\x10\t\x12\x0b\n\x07mean_lt\x10\n\"1\n\tWatchNode\x12\x11\n\tnode_name\x18\x01 \x01(\t\x12\x11\n\tnode_type\x18\x02 \x01(\t\"u\n\rWatchpointHit\x12%\n\x06tensor\x18\x01 \x01(\x0b\x32\x15.debugger.TensorProto\x12\x31\n\x0fwatch_condition\x18\x02 \x01(\x0b\x32\x18.debugger.WatchCondition\x12\n\n\x02id\x18\x03 \x01(\x05\x32\xc3\x02\n\rEventListener\x12\x35\n\x07WaitCMD\x12\x12.debugger.Metadata\x1a\x14.debugger.EventReply\"\x00\x12:\n\x0cSendMetadata\x12\x12.debugger.Metadata\x1a\x14.debugger.EventReply\"\x00\x12\x36\n\tSendGraph\x12\x0f.debugger.Chunk\x1a\x14.debugger.EventReply\"\x00(\x01\x12>\n\x0bSendTensors\x12\x15.debugger.TensorProto\x1a\x14.debugger.EventReply\"\x00(\x01\x12G\n\x12SendWatchpointHits\x12\x17.debugger.WatchpointHit\x1a\x14.debugger.EventReply\"\x00(\x01\x62\x06proto3'
serialized_pb=b'\n+mindinsight/debugger/proto/debug_grpc.proto\x12\x08\x64\x65\x62ugger\x1a)mindinsight/debugger/proto/ms_graph.proto\"~\n\x08Metadata\x12\x13\n\x0b\x64\x65vice_name\x18\x01 \x01(\t\x12\x10\n\x08\x63ur_step\x18\x02 \x01(\x05\x12\x0f\n\x07\x62\x61\x63kend\x18\x03 \x01(\t\x12\x10\n\x08\x63ur_node\x18\x04 \x01(\t\x12\x15\n\rtraining_done\x18\x05 \x01(\x08\x12\x11\n\tgraph_num\x18\x06 \x01(\x05\")\n\x05\x43hunk\x12\x0e\n\x06\x62uffer\x18\x01 \x01(\x0c\x12\x10\n\x08\x66inished\x18\x02 \x01(\x08\"\xec\x01\n\nEventReply\x12+\n\x06status\x18\x01 \x01(\x0e\x32\x1b.debugger.EventReply.Status\x12\x0e\n\x04\x65xit\x18\x02 \x01(\x08H\x00\x12#\n\x07run_cmd\x18\x03 \x01(\x0b\x32\x10.debugger.RunCMDH\x00\x12#\n\x07set_cmd\x18\x04 \x01(\x0b\x32\x10.debugger.SetCMDH\x00\x12%\n\x08view_cmd\x18\x05 \x01(\x0b\x32\x11.debugger.ViewCMDH\x00\")\n\x06Status\x12\x06\n\x02OK\x10\x00\x12\n\n\x06\x46\x41ILED\x10\x01\x12\x0b\n\x07PENDING\x10\x02\x42\x05\n\x03\x63md\"L\n\x06RunCMD\x12\x11\n\trun_level\x18\x01 \x01(\t\x12\x13\n\trun_steps\x18\x02 \x01(\x05H\x00\x12\x13\n\tnode_name\x18\x03 \x01(\tH\x00\x42\x05\n\x03\x63md\"\x81\x01\n\x06SetCMD\x12(\n\x0bwatch_nodes\x18\x01 \x03(\x0b\x32\x13.debugger.WatchNode\x12\x31\n\x0fwatch_condition\x18\x02 \x01(\x0b\x32\x18.debugger.WatchCondition\x12\x0e\n\x06\x64\x65lete\x18\x03 \x01(\x08\x12\n\n\x02id\x18\x04 \x01(\x05\"1\n\x07ViewCMD\x12&\n\x07tensors\x18\x01 \x03(\x0b\x32\x15.debugger.TensorProto\"\xcc\x04\n\x0eWatchCondition\x12\x35\n\tcondition\x18\x01 \x01(\x0e\x32\".debugger.WatchCondition.Condition\x12\r\n\x05value\x18\x02 \x01(\x02\x12\x32\n\x06params\x18\x04 \x03(\x0b\x32\".debugger.WatchCondition.Parameter\x1aG\n\tParameter\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x10\n\x08\x64isabled\x18\x02 \x01(\x08\x12\r\n\x05value\x18\x03 \x01(\x01\x12\x0b\n\x03hit\x18\x04 \x01(\x08\"\xf6\x02\n\tCondition\x12\x07\n\x03nan\x10\x00\x12\x07\n\x03inf\x10\x01\x12\x0c\n\x08overflow\x10\x02\x12\n\n\x06max_gt\x10\x03\x12\n\n\x06max_lt\x10\x04\x12\n\n\x06min_gt\x10\x05\x12\n\n\x06min_lt\x10\x06\x12\x0e\n\nmax_min_gt\x10\x07\x12\x0e\n\nmax_min_lt\x10\x08\x12\x0b\n\x07mean_gt\x10\t\x12\x0b\n\x07mean_lt\x10\n\x12\t\n\x05sd_gt\x10\x0b\x12\t\n\x05sd_lt\x10\x0c\x12\x1b\n\x17tensor_general_overflow\x10\r\x12\x19\n\x15tensor_initialization\x10\x0e\x12\x14\n\x10tensor_too_large\x10\x0f\x12\x14\n\x10tensor_too_small\x10\x10\x12\x13\n\x0ftensor_all_zero\x10\x11\x12\x1b\n\x17tensor_change_too_large\x10\x12\x12\x1b\n\x17tensor_change_too_small\x10\x13\x12\x16\n\x12tensor_not_changed\x10\x14\"1\n\tWatchNode\x12\x11\n\tnode_name\x18\x01 \x01(\t\x12\x11\n\tnode_type\x18\x02 \x01(\t\"u\n\rWatchpointHit\x12%\n\x06tensor\x18\x01 \x01(\x0b\x32\x15.debugger.TensorProto\x12\x31\n\x0fwatch_condition\x18\x02 \x01(\x0b\x32\x18.debugger.WatchCondition\x12\n\n\x02id\x18\x03 \x01(\x05\x32\x81\x03\n\rEventListener\x12\x35\n\x07WaitCMD\x12\x12.debugger.Metadata\x1a\x14.debugger.EventReply\"\x00\x12:\n\x0cSendMetadata\x12\x12.debugger.Metadata\x1a\x14.debugger.EventReply\"\x00\x12\x36\n\tSendGraph\x12\x0f.debugger.Chunk\x1a\x14.debugger.EventReply\"\x00(\x01\x12>\n\x0bSendTensors\x12\x15.debugger.TensorProto\x1a\x14.debugger.EventReply\"\x00(\x01\x12G\n\x12SendWatchpointHits\x12\x17.debugger.WatchpointHit\x1a\x14.debugger.EventReply\"\x00(\x01\x12<\n\x0fSendMultiGraphs\x12\x0f.debugger.Chunk\x1a\x14.debugger.EventReply\"\x00(\x01\x62\x06proto3'
,
dependencies=[mindinsight_dot_debugger_dot_proto_dot_ms__graph__pb2.DESCRIPTOR,])

@@ -46,8 +46,8 @@ _EVENTREPLY_STATUS = _descriptor.EnumDescriptor(
],
containing_type=None,
serialized_options=None,
serialized_start=423,
serialized_end=464,
serialized_start=460,
serialized_end=501,
)
_sym_db.RegisterEnumDescriptor(_EVENTREPLY_STATUS)

@@ -101,11 +101,51 @@ _WATCHCONDITION_CONDITION = _descriptor.EnumDescriptor(
name='mean_lt', index=10, number=10,
serialized_options=None,
type=None),
_descriptor.EnumValueDescriptor(
name='sd_gt', index=11, number=11,
serialized_options=None,
type=None),
_descriptor.EnumValueDescriptor(
name='sd_lt', index=12, number=12,
serialized_options=None,
type=None),
_descriptor.EnumValueDescriptor(
name='tensor_general_overflow', index=13, number=13,
serialized_options=None,
type=None),
_descriptor.EnumValueDescriptor(
name='tensor_initialization', index=14, number=14,
serialized_options=None,
type=None),
_descriptor.EnumValueDescriptor(
name='tensor_too_large', index=15, number=15,
serialized_options=None,
type=None),
_descriptor.EnumValueDescriptor(
name='tensor_too_small', index=16, number=16,
serialized_options=None,
type=None),
_descriptor.EnumValueDescriptor(
name='tensor_all_zero', index=17, number=17,
serialized_options=None,
type=None),
_descriptor.EnumValueDescriptor(
name='tensor_change_too_large', index=18, number=18,
serialized_options=None,
type=None),
_descriptor.EnumValueDescriptor(
name='tensor_change_too_small', index=19, number=19,
serialized_options=None,
type=None),
_descriptor.EnumValueDescriptor(
name='tensor_not_changed', index=20, number=20,
serialized_options=None,
type=None),
],
containing_type=None,
serialized_options=None,
serialized_start=824,
serialized_end=973,
serialized_start=986,
serialized_end=1360,
)
_sym_db.RegisterEnumDescriptor(_WATCHCONDITION_CONDITION)

@@ -152,6 +192,13 @@ _METADATA = _descriptor.Descriptor(
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='graph_num', full_name='debugger.Metadata.graph_num', index=5,
number=6, type=5, cpp_type=1, label=1,
has_default_value=False, default_value=0,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR),
],
extensions=[
],
@@ -165,7 +212,7 @@ _METADATA = _descriptor.Descriptor(
oneofs=[
],
serialized_start=100,
serialized_end=207,
serialized_end=226,
)


@@ -183,6 +230,13 @@ _CHUNK = _descriptor.Descriptor(
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='finished', full_name='debugger.Chunk.finished', index=1,
number=2, type=8, cpp_type=7, label=1,
has_default_value=False, default_value=False,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR),
],
extensions=[
],
@@ -195,8 +249,8 @@ _CHUNK = _descriptor.Descriptor(
extension_ranges=[],
oneofs=[
],
serialized_start=209,
serialized_end=232,
serialized_start=228,
serialized_end=269,
)


@@ -258,8 +312,8 @@ _EVENTREPLY = _descriptor.Descriptor(
name='cmd', full_name='debugger.EventReply.cmd',
index=0, containing_type=None, fields=[]),
],
serialized_start=235,
serialized_end=471,
serialized_start=272,
serialized_end=508,
)


@@ -306,8 +360,8 @@ _RUNCMD = _descriptor.Descriptor(
name='cmd', full_name='debugger.RunCMD.cmd',
index=0, containing_type=None, fields=[]),
],
serialized_start=473,
serialized_end=549,
serialized_start=510,
serialized_end=586,
)


@@ -358,8 +412,8 @@ _SETCMD = _descriptor.Descriptor(
extension_ranges=[],
oneofs=[
],
serialized_start=552,
serialized_end=681,
serialized_start=589,
serialized_end=718,
)


@@ -389,11 +443,62 @@ _VIEWCMD = _descriptor.Descriptor(
extension_ranges=[],
oneofs=[
],
serialized_start=683,
serialized_end=732,
serialized_start=720,
serialized_end=769,
)


_WATCHCONDITION_PARAMETER = _descriptor.Descriptor(
name='Parameter',
full_name='debugger.WatchCondition.Parameter',
filename=None,
file=DESCRIPTOR,
containing_type=None,
fields=[
_descriptor.FieldDescriptor(
name='name', full_name='debugger.WatchCondition.Parameter.name', index=0,
number=1, type=9, cpp_type=9, label=1,
has_default_value=False, default_value=b"".decode('utf-8'),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='disabled', full_name='debugger.WatchCondition.Parameter.disabled', index=1,
number=2, type=8, cpp_type=7, label=1,
has_default_value=False, default_value=False,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='value', full_name='debugger.WatchCondition.Parameter.value', index=2,
number=3, type=1, cpp_type=5, label=1,
has_default_value=False, default_value=float(0),
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='hit', full_name='debugger.WatchCondition.Parameter.hit', index=3,
number=4, type=8, cpp_type=7, label=1,
has_default_value=False, default_value=False,
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR),
],
extensions=[
],
nested_types=[],
enum_types=[
],
serialized_options=None,
is_extendable=False,
syntax='proto3',
extension_ranges=[],
oneofs=[
],
serialized_start=912,
serialized_end=983,
)

_WATCHCONDITION = _descriptor.Descriptor(
name='WatchCondition',
full_name='debugger.WatchCondition',
@@ -415,10 +520,17 @@ _WATCHCONDITION = _descriptor.Descriptor(
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR),
_descriptor.FieldDescriptor(
name='params', full_name='debugger.WatchCondition.params', index=2,
number=4, type=11, cpp_type=10, label=3,
has_default_value=False, default_value=[],
message_type=None, enum_type=None, containing_type=None,
is_extension=False, extension_scope=None,
serialized_options=None, file=DESCRIPTOR),
],
extensions=[
],
nested_types=[],
nested_types=[_WATCHCONDITION_PARAMETER, ],
enum_types=[
_WATCHCONDITION_CONDITION,
],
@@ -428,8 +540,8 @@ _WATCHCONDITION = _descriptor.Descriptor(
extension_ranges=[],
oneofs=[
],
serialized_start=735,
serialized_end=973,
serialized_start=772,
serialized_end=1360,
)


@@ -466,8 +578,8 @@ _WATCHNODE = _descriptor.Descriptor(
extension_ranges=[],
oneofs=[
],
serialized_start=975,
serialized_end=1024,
serialized_start=1362,
serialized_end=1411,
)


@@ -511,8 +623,8 @@ _WATCHPOINTHIT = _descriptor.Descriptor(
extension_ranges=[],
oneofs=[
],
serialized_start=1026,
serialized_end=1143,
serialized_start=1413,
serialized_end=1530,
)

_EVENTREPLY.fields_by_name['status'].enum_type = _EVENTREPLY_STATUS
@@ -541,7 +653,9 @@ _RUNCMD.fields_by_name['node_name'].containing_oneof = _RUNCMD.oneofs_by_name['c
_SETCMD.fields_by_name['watch_nodes'].message_type = _WATCHNODE
_SETCMD.fields_by_name['watch_condition'].message_type = _WATCHCONDITION
_VIEWCMD.fields_by_name['tensors'].message_type = mindinsight_dot_debugger_dot_proto_dot_ms__graph__pb2._TENSORPROTO
_WATCHCONDITION_PARAMETER.containing_type = _WATCHCONDITION
_WATCHCONDITION.fields_by_name['condition'].enum_type = _WATCHCONDITION_CONDITION
_WATCHCONDITION.fields_by_name['params'].message_type = _WATCHCONDITION_PARAMETER
_WATCHCONDITION_CONDITION.containing_type = _WATCHCONDITION
_WATCHPOINTHIT.fields_by_name['tensor'].message_type = mindinsight_dot_debugger_dot_proto_dot_ms__graph__pb2._TENSORPROTO
_WATCHPOINTHIT.fields_by_name['watch_condition'].message_type = _WATCHCONDITION
@@ -599,11 +713,19 @@ ViewCMD = _reflection.GeneratedProtocolMessageType('ViewCMD', (_message.Message,
_sym_db.RegisterMessage(ViewCMD)

WatchCondition = _reflection.GeneratedProtocolMessageType('WatchCondition', (_message.Message,), {

'Parameter' : _reflection.GeneratedProtocolMessageType('Parameter', (_message.Message,), {
'DESCRIPTOR' : _WATCHCONDITION_PARAMETER,
'__module__' : 'mindinsight.debugger.proto.debug_grpc_pb2'
# @@protoc_insertion_point(class_scope:debugger.WatchCondition.Parameter)
})
,
'DESCRIPTOR' : _WATCHCONDITION,
'__module__' : 'mindinsight.debugger.proto.debug_grpc_pb2'
# @@protoc_insertion_point(class_scope:debugger.WatchCondition)
})
_sym_db.RegisterMessage(WatchCondition)
_sym_db.RegisterMessage(WatchCondition.Parameter)

WatchNode = _reflection.GeneratedProtocolMessageType('WatchNode', (_message.Message,), {
'DESCRIPTOR' : _WATCHNODE,
@@ -627,8 +749,8 @@ _EVENTLISTENER = _descriptor.ServiceDescriptor(
file=DESCRIPTOR,
index=0,
serialized_options=None,
serialized_start=1146,
serialized_end=1469,
serialized_start=1533,
serialized_end=1918,
methods=[
_descriptor.MethodDescriptor(
name='WaitCMD',
@@ -675,6 +797,15 @@ _EVENTLISTENER = _descriptor.ServiceDescriptor(
output_type=_EVENTREPLY,
serialized_options=None,
),
_descriptor.MethodDescriptor(
name='SendMultiGraphs',
full_name='debugger.EventListener.SendMultiGraphs',
index=5,
containing_service=None,
input_type=_CHUNK,
output_type=_EVENTREPLY,
serialized_options=None,
),
])
_sym_db.RegisterServiceDescriptor(_EVENTLISTENER)
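Taken together, the regenerated descriptors above add a nested WatchCondition.Parameter message, a finished flag on Chunk, a graph_num field on Metadata, and the SendMultiGraphs RPC. A minimal sketch of filling the new-style condition message follows; the parameter name and threshold are illustrative only, not taken from this commit:

from mindinsight.debugger.proto.debug_grpc_pb2 import WatchCondition

condition = WatchCondition()
condition.condition = WatchCondition.Condition.tensor_too_large
param = condition.params.add()  # repeated Parameter field added in this commit
param.name = 'abs_mean_gt'      # hypothetical parameter name
param.value = 1.0
param.disabled = False
condition.value = param.value   # the legacy single-value field is still set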



+ 52
- 13
mindinsight/debugger/proto/debug_grpc_pb2_grpc.py View File

@@ -1,4 +1,5 @@
# Generated by the gRPC Python protocol compiler plugin. DO NOT EDIT!
"""Client and server classes corresponding to protobuf-defined services."""
import grpc

from mindinsight.debugger.proto import debug_grpc_pb2 as mindinsight_dot_debugger_dot_proto_dot_debug__grpc__pb2
@@ -6,7 +7,7 @@ from mindinsight.debugger.proto import ms_graph_pb2 as mindinsight_dot_debugger_


class EventListenerStub(object):
"""Missing associated documentation comment in .proto file"""
"""Missing associated documentation comment in .proto file."""

def __init__(self, channel):
"""Constructor.
@@ -39,37 +40,48 @@ class EventListenerStub(object):
request_serializer=mindinsight_dot_debugger_dot_proto_dot_debug__grpc__pb2.WatchpointHit.SerializeToString,
response_deserializer=mindinsight_dot_debugger_dot_proto_dot_debug__grpc__pb2.EventReply.FromString,
)
self.SendMultiGraphs = channel.stream_unary(
'/debugger.EventListener/SendMultiGraphs',
request_serializer=mindinsight_dot_debugger_dot_proto_dot_debug__grpc__pb2.Chunk.SerializeToString,
response_deserializer=mindinsight_dot_debugger_dot_proto_dot_debug__grpc__pb2.EventReply.FromString,
)


class EventListenerServicer(object):
"""Missing associated documentation comment in .proto file"""
"""Missing associated documentation comment in .proto file."""

def WaitCMD(self, request, context):
"""Missing associated documentation comment in .proto file"""
"""Missing associated documentation comment in .proto file."""
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
context.set_details('Method not implemented!')
raise NotImplementedError('Method not implemented!')

def SendMetadata(self, request, context):
"""Missing associated documentation comment in .proto file"""
"""Missing associated documentation comment in .proto file."""
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
context.set_details('Method not implemented!')
raise NotImplementedError('Method not implemented!')

def SendGraph(self, request_iterator, context):
"""Missing associated documentation comment in .proto file"""
"""Missing associated documentation comment in .proto file."""
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
context.set_details('Method not implemented!')
raise NotImplementedError('Method not implemented!')

def SendTensors(self, request_iterator, context):
"""Missing associated documentation comment in .proto file"""
"""Missing associated documentation comment in .proto file."""
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
context.set_details('Method not implemented!')
raise NotImplementedError('Method not implemented!')

def SendWatchpointHits(self, request_iterator, context):
"""Missing associated documentation comment in .proto file"""
"""Missing associated documentation comment in .proto file."""
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
context.set_details('Method not implemented!')
raise NotImplementedError('Method not implemented!')

def SendMultiGraphs(self, request_iterator, context):
"""Missing associated documentation comment in .proto file."""
context.set_code(grpc.StatusCode.UNIMPLEMENTED)
context.set_details('Method not implemented!')
raise NotImplementedError('Method not implemented!')
@@ -102,6 +114,11 @@ def add_EventListenerServicer_to_server(servicer, server):
request_deserializer=mindinsight_dot_debugger_dot_proto_dot_debug__grpc__pb2.WatchpointHit.FromString,
response_serializer=mindinsight_dot_debugger_dot_proto_dot_debug__grpc__pb2.EventReply.SerializeToString,
),
'SendMultiGraphs': grpc.stream_unary_rpc_method_handler(
servicer.SendMultiGraphs,
request_deserializer=mindinsight_dot_debugger_dot_proto_dot_debug__grpc__pb2.Chunk.FromString,
response_serializer=mindinsight_dot_debugger_dot_proto_dot_debug__grpc__pb2.EventReply.SerializeToString,
),
}
generic_handler = grpc.method_handlers_generic_handler(
'debugger.EventListener', rpc_method_handlers)
@@ -110,7 +127,7 @@ def add_EventListenerServicer_to_server(servicer, server):

# This class is part of an EXPERIMENTAL API.
class EventListener(object):
"""Missing associated documentation comment in .proto file"""
"""Missing associated documentation comment in .proto file."""

@staticmethod
def WaitCMD(request,
@@ -118,6 +135,7 @@ class EventListener(object):
options=(),
channel_credentials=None,
call_credentials=None,
insecure=False,
compression=None,
wait_for_ready=None,
timeout=None,
@@ -126,7 +144,7 @@ class EventListener(object):
mindinsight_dot_debugger_dot_proto_dot_debug__grpc__pb2.Metadata.SerializeToString,
mindinsight_dot_debugger_dot_proto_dot_debug__grpc__pb2.EventReply.FromString,
options, channel_credentials,
call_credentials, compression, wait_for_ready, timeout, metadata)
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

@staticmethod
def SendMetadata(request,
@@ -134,6 +152,7 @@ class EventListener(object):
options=(),
channel_credentials=None,
call_credentials=None,
insecure=False,
compression=None,
wait_for_ready=None,
timeout=None,
@@ -142,7 +161,7 @@ class EventListener(object):
mindinsight_dot_debugger_dot_proto_dot_debug__grpc__pb2.Metadata.SerializeToString,
mindinsight_dot_debugger_dot_proto_dot_debug__grpc__pb2.EventReply.FromString,
options, channel_credentials,
call_credentials, compression, wait_for_ready, timeout, metadata)
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

@staticmethod
def SendGraph(request_iterator,
@@ -150,6 +169,7 @@ class EventListener(object):
options=(),
channel_credentials=None,
call_credentials=None,
insecure=False,
compression=None,
wait_for_ready=None,
timeout=None,
@@ -158,7 +178,7 @@ class EventListener(object):
mindinsight_dot_debugger_dot_proto_dot_debug__grpc__pb2.Chunk.SerializeToString,
mindinsight_dot_debugger_dot_proto_dot_debug__grpc__pb2.EventReply.FromString,
options, channel_credentials,
call_credentials, compression, wait_for_ready, timeout, metadata)
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

@staticmethod
def SendTensors(request_iterator,
@@ -166,6 +186,7 @@ class EventListener(object):
options=(),
channel_credentials=None,
call_credentials=None,
insecure=False,
compression=None,
wait_for_ready=None,
timeout=None,
@@ -174,7 +195,7 @@ class EventListener(object):
mindinsight_dot_debugger_dot_proto_dot_ms__graph__pb2.TensorProto.SerializeToString,
mindinsight_dot_debugger_dot_proto_dot_debug__grpc__pb2.EventReply.FromString,
options, channel_credentials,
call_credentials, compression, wait_for_ready, timeout, metadata)
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

@staticmethod
def SendWatchpointHits(request_iterator,
@@ -182,6 +203,7 @@ class EventListener(object):
options=(),
channel_credentials=None,
call_credentials=None,
insecure=False,
compression=None,
wait_for_ready=None,
timeout=None,
@@ -190,4 +212,21 @@ class EventListener(object):
mindinsight_dot_debugger_dot_proto_dot_debug__grpc__pb2.WatchpointHit.SerializeToString,
mindinsight_dot_debugger_dot_proto_dot_debug__grpc__pb2.EventReply.FromString,
options, channel_credentials,
call_credentials, compression, wait_for_ready, timeout, metadata)
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)

@staticmethod
def SendMultiGraphs(request_iterator,
target,
options=(),
channel_credentials=None,
call_credentials=None,
insecure=False,
compression=None,
wait_for_ready=None,
timeout=None,
metadata=None):
return grpc.experimental.stream_unary(request_iterator, target, '/debugger.EventListener/SendMultiGraphs',
mindinsight_dot_debugger_dot_proto_dot_debug__grpc__pb2.Chunk.SerializeToString,
mindinsight_dot_debugger_dot_proto_dot_debug__grpc__pb2.EventReply.FromString,
options, channel_credentials,
insecure, call_credentials, compression, wait_for_ready, timeout, metadata)
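A minimal server-side sketch of the new stream-unary RPC: the servicer reassembles the chunked graph bytes until a chunk arrives with finished=True. The executor size and port below are assumptions for illustration, not values from this commit.

from concurrent import futures

import grpc

from mindinsight.debugger.proto import debug_grpc_pb2, debug_grpc_pb2_grpc


class DemoListener(debug_grpc_pb2_grpc.EventListenerServicer):
    """Servicer that only handles the new SendMultiGraphs stream."""

    def SendMultiGraphs(self, request_iterator, context):
        buffers = []
        for chunk in request_iterator:
            buffers.append(chunk.buffer)
            if chunk.finished:
                break
        graph_bytes = b''.join(buffers)
        print('received %d bytes of graph data' % len(graph_bytes))
        return debug_grpc_pb2.EventReply(status=debug_grpc_pb2.EventReply.Status.OK)


server = grpc.server(futures.ThreadPoolExecutor(max_workers=4))
debug_grpc_pb2_grpc.add_EventListenerServicer_to_server(DemoListener(), server)
server.add_insecure_port('[::]:50051')
server.start()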

+ 177
- 7
mindinsight/debugger/stream_cache/debugger_graph.py View File

@@ -14,16 +14,66 @@
# ============================================================================
"""This file is used to define the basic graph."""
from collections import deque
from copy import deepcopy

from mindinsight.datavisual.data_transform.graph.msgraph import MSGraph
from mindinsight.debugger.common.exceptions.exceptions import \
DebuggerNodeNotInGraphError, DebuggerParamValueError
from mindinsight.debugger.common.log import logger as log
from mindinsight.debugger.common.log import LOGGER as log
from .node_type_identifier import NodeTypeIdentifier


def _is_match(identifier, node, condition):
"""Check if the node is matched to the identifier.
Args:
identifier (NodeTypeIdentifier): The debug name of the node.
node (Node obj): The number of layers the user wants to trace. Default is 0.

Returns:
list, a list of the traced tensors' name and node type,
arranged in order from leaf node to root node.
int, the number of output tensors.
"""
if condition:
matched = identifier.is_match(node, condition)
else:
matched = identifier.is_match(node)
return matched


class DebuggerGraph(MSGraph):
"""The `DebuggerGraph` object provides interfaces to describe a debugger graph."""

@property
def leaf_nodes(self):
"""Return the leaf nodes."""
return self._leaf_nodes

@property
def normal_node_map(self):
"""Return the normal_node_map"""
return self._normal_node_map

@property
def node_id_map_name(self):
"""Return the node_id_map_name"""
return self._node_id_map_name

@property
def const_node_temp_cache(self):
"""Return const_node_temp_cache"""
return self._const_node_temp_cache

@property
def parameter_node_temp_cache(self):
"""Return parameter_node_temp_cache"""
return self._parameter_node_temp_cache

@property
def full_name_map_name(self):
"""Return full_name_map_name"""
return self._full_name_map_name

def get_node_name_by_full_name(self, full_name):
"""Get node name by full names."""
inner_name = self._full_name_map_name.get(full_name, '')
@@ -33,12 +83,15 @@ class DebuggerGraph(MSGraph):
return inner_name

def get_full_name_by_node_name(self, node_name):
"""Get full name by node name for leaf nodes."""
"""Get full name by node name."""
if not node_name:
return ''
node = self._normal_node_map.get(node_name)
if not node:
log.warning("Node %s is not leaf node.", node_name)
log.error("Node <%s> is not in graph.", node_name)
raise DebuggerNodeNotInGraphError(node_name=node_name)

return node.full_name if node else ''
return node.full_name

def get_node_type(self, node_name):
"""
@@ -48,14 +101,48 @@ class DebuggerGraph(MSGraph):
node_name (str): The full name of the node with its scope.

Returns:
A string, leaf or name_scope.
str, node type or name_scope.
"""
if node_name and not self.exist_node(name=node_name):
if not node_name:
return 'name_scope'
node = self._normal_node_map.get(node_name)
if not node:
log.error("Node <%s> is not in graph.", node_name)
raise DebuggerNodeNotInGraphError(node_name=node_name)

node = self._normal_node_map.get(node_name)
return node.type

def search_nodes_by_category(self, node_category, condition=None):
"""
Search nodes by category.

Args:
node_category (TargetTypeEnum): The node type supported in
mindinsight.conditionmgr.condition.TargetTypeEnum.
condition (dict): Search condition. Default: None.

- activation_func (Union[str, list[str]]): The target activation functions. Used when
node_category is TargetTypeEnum.ACTIVATION.

- search_range (list[Node]): The list of nodes to be searched from.

Returns:
list[Node], list of nodes.
"""
identifier = NodeTypeIdentifier(node_category.value)
# get search range
condition = {} if condition is None else condition
search_range = condition.pop('search_range', None)
if not search_range:
search_range = self._leaf_nodes.values()
# search match nodes
matched_nodes = []
for node in search_range:
matched = _is_match(identifier, node, condition)
if matched:
matched_nodes.append(node)
return matched_nodes

def get_tensor_history(self, node_name, depth=0):
"""
Get the tensor history of a specified node.
@@ -188,3 +275,86 @@ class DebuggerGraph(MSGraph):
raise DebuggerParamValueError(msg)

return default_root

def get_tensor_graph(self, node_name):
"""
Get graph relative to a node.

Args:
node_name (str): Node name.

Returns:
dict, tensor graph, format is:
{'nodes': [
{'name': <node name>,
'full_name': <node full name>,
'type': <node type>,
'input': <input objects>,
'output': <output objects>,
'slots': [{'slot': <slot id>}]
}
]}
"""
graph_nodes = []
cur_node = self._leaf_nodes.get(node_name)
node_detail_info = cur_node.to_dict()
cur_node_info = self._get_node_info_for_tensor_graph(cur_node)
cur_node_info['input'] = deepcopy(node_detail_info.get('input'))
cur_node_info['output'] = deepcopy(node_detail_info.get('output'))
self._add_input_node_info(cur_node_info=cur_node_info, graph_nodes=graph_nodes)
self._add_output_node_info(cur_node=cur_node, cur_node_info=cur_node_info, graph_nodes=graph_nodes)
graph_nodes.append(cur_node_info)
return {'nodes': graph_nodes}

@staticmethod
def _get_node_info_for_tensor_graph(node):
"""Get node infos for tensor graph."""
node_info = {
'name': node.name,
'full_name': node.full_name,
'type': node.type,
'input': {},
'output': {},
'slots': [{'slot': str(slot)} for slot in range(node.output_nums)]
}
return node_info

def _add_output_node_info(self, cur_node, cur_node_info, graph_nodes):
"""
Add output node info into cur_node_info and node list.

Args:
cur_node (Node): The current node object.
cur_node_info (dict): Current node info.
graph_nodes (list[<Node info>]): The nodes in tensor graph.
"""
output_slot_mapping = self._get_slot_mapping(cur_node)
for node_name, edge_info in cur_node_info.get('output').items():
edge_info['slot_mapping'] = output_slot_mapping
# add output node info into graph
output_node = self._leaf_nodes.get(node_name)
output_node_info = self._get_node_info_for_tensor_graph(output_node)
output_node_info['input'][cur_node.name] = edge_info
graph_nodes.append(output_node_info)

def _add_input_node_info(self, cur_node_info, graph_nodes):
"""
Add input node info into cur_node_info and node list.

Args:
cur_node_info (dict): Current node info.
graph_nodes (list[<Node info>]): The nodes in tensor graph.
"""
cur_node_name = cur_node_info.get('name')
for node_name, edge_info in cur_node_info.get('input').items():
input_node = self._leaf_nodes.get(node_name)
edge_info['slot_mapping'] = self._get_slot_mapping(input_node)
# add input node info into graph
input_node_info = self._get_node_info_for_tensor_graph(input_node)
input_node_info['output'][cur_node_name] = edge_info
graph_nodes.append(input_node_info)

@staticmethod
def _get_slot_mapping(input_node):
"""Get slot mapping between nodes."""
return [[str(slot), ''] for slot in range(input_node.output_nums)]
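A usage sketch for the new category search. It passes stand-in leaf nodes through the search_range condition so the snippet runs without a fully built graph; the no-argument DebuggerGraph() construction is an assumption about MSGraph, and TargetTypeEnum.ACTIVATION is the member referenced in the docstring above:

from collections import namedtuple

from mindinsight.conditionmgr.condition import TargetTypeEnum
from mindinsight.debugger.stream_cache.debugger_graph import DebuggerGraph

FakeNode = namedtuple('FakeNode', ['name', 'type'])  # illustrative stand-in nodes
search_range = [FakeNode('Default/ReLU-op3', 'ReLU'),
                FakeNode('Default/Conv2D-op1', 'Conv2D')]

graph = DebuggerGraph()
matched = graph.search_nodes_by_category(
    TargetTypeEnum.ACTIVATION, condition={'search_range': search_range})
print([node.name for node in matched])  # ['Default/ReLU-op3']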

+ 81
- 0
mindinsight/debugger/stream_cache/debugger_multigraph.py View File

@@ -0,0 +1,81 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""This file is used to define the basic graph."""
import copy
from mindinsight.debugger.common.log import LOGGER as log
from mindinsight.datavisual.data_transform.graph.node import Node, NodeTypeEnum
from .debugger_graph import DebuggerGraph

class DebuggerMultiGraph(DebuggerGraph):
"""The `DebuggerMultiGraph` object provides interfaces to describe a debugger multigraph."""

def add_graph(self, graph_dict):
"""
Add sub graphs into the DebuggerMultiGraph.

Args:
graph_dict (dict): The <graph_name, graph_object> dict.
"""
if len(graph_dict) == 1:
graph = list(graph_dict.values())[0]
self._normal_node_map = graph.normal_node_map
self._node_id_map_name = graph.node_id_map_name
self._const_node_temp_cache = graph.const_node_temp_cache
self._parameter_node_temp_cache = graph.parameter_node_temp_cache
self._leaf_nodes = graph.leaf_nodes
self._full_name_map_name = graph.full_name_map_name
else:
for graph_name, graph in graph_dict.items():
log.debug("add graph %s into whole graph.", graph_name)

# add nodes
normal_nodes = copy.deepcopy(graph.normal_node_map)
for _, node_obj in normal_nodes.items():
pre_scope = graph_name + "/"
node_obj.name = pre_scope + node_obj.name
node_obj.full_name = pre_scope + node_obj.full_name
if node_obj.scope:
node_obj.scope = pre_scope + node_obj.scope
else:
node_obj.scope = graph_name

# update inputs
old_inputs = copy.deepcopy(node_obj.inputs)
for src_name, input_attr in old_inputs.items():
new_src_name = graph_name + "/" + src_name
node_obj.add_inputs(new_src_name, input_attr)
node_obj.delete_inputs(src_name)

# update_outputs
old_outputs = copy.deepcopy(node_obj.outputs)
for dst_name, output_attr in old_outputs.items():
new_dst_name = graph_name + "/" + dst_name
node_obj.add_outputs(new_dst_name, output_attr)
node_obj.delete_outputs(dst_name)

self._cache_node(node_obj)

# add graph_node
node = Node(name=graph_name, node_id=graph_name)
node.type = NodeTypeEnum.NAME_SCOPE.value
node.subnode_count = len(graph.list_node_by_scope())
self._cache_node(node)

self._leaf_nodes = self._get_leaf_nodes()
self._full_name_map_name = self._get_leaf_node_full_name_map()

log.info(
"Build multi_graph end, all node count: %s, const count: %s, parameter count: %s.",
self.normal_node_count, len(self._const_node_temp_cache),
len(self._parameter_node_temp_cache))
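The merge logic above boils down to one renaming rule: every sub-graph node is re-rooted under its graph name, so names stay unique in the merged graph. The rule, sketched standalone:

def with_graph_scope(graph_name, node_name):
    """Prefix a node name with its graph name, as add_graph does."""
    return '/'.join([graph_name, node_name]) if node_name else graph_name


assert with_graph_scope('graph_0', 'Default/Conv2D-op1') == 'graph_0/Default/Conv2D-op1'
assert with_graph_scope('graph_0', '') == 'graph_0'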

+ 143
- 0
mindinsight/debugger/stream_cache/node_type_identifier.py View File

@@ -0,0 +1,143 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""This file is used to identify the type of the node."""
import sys

from mindinsight.datavisual.data_transform.graph import NodeTypeEnum
from mindinsight.debugger.common.exceptions.exceptions import DebuggerParamValueError

_ACTIVATIONS = [
'Softmax',
'LogSoftmax',
'ReLU',
'ReLU6',
'Tanh',
'GELU',
'ELU',
'Sigmoid',
'PReLU',
'LeakyReLU',
'HSwish',
'HSigmoid',
'LogSigmoid'
]


class NodeTypeIdentifier:
"""Node type identifier."""

def __init__(self, node_type):
self.identify_func = self.get_identify_func(node_type)

@staticmethod
def get_identify_func(node_type):
"""
Get the identify function in this module.

Args:
node_type (str): The node type.

Returns:
function, the identify function.
"""
# the name of the identify function should start with 'is_' and end with '_node'
target_name = 'is_' + node_type + '_node'
cur_module = sys.modules[__name__]
for sub_module in dir(cur_module):
# the rule to get the identify function
if sub_module == target_name:
return getattr(cur_module, sub_module)
raise DebuggerParamValueError("Invalid identify type.")

def is_match(self, *args, **kwargs):
"""Check if the input match the idenfity function."""
return self.identify_func(*args, **kwargs)


def is_weight_node(node):
"""
Check if the node is weight type.

Args:
node (Node): The node object.

Returns:
bool, if the node is weight type.
"""
if node.type == NodeTypeEnum.PARAMETER.value:
node_name = node.name.lower()
weight_flag = False
if node_name.endswith('.weight') or node_name.endswith('.bias'):
weight_flag = True
if weight_flag and 'optimizer-' not in node_name and not node_name.startswith('gradients/'):
return True
return False


def is_activation_node(node, condition=None):
"""
Check if the node is activation type.

Args:
node (Node): The node object.
condition (dict): Filter condition.

- activation_func (Union[str, list[str]]): The target activation functions.

Returns:
bool, if the node is activation type.
"""
activation_funcs = condition.get('activation_func') if condition else _ACTIVATIONS
if not activation_funcs:
activation_funcs = _ACTIVATIONS
if not isinstance(activation_funcs, list):
activation_funcs = [activation_funcs]

if not is_gradient_node(node):
node_type = node.type
for activation_name in activation_funcs:
if node_type == activation_name:
return True
return False


def is_gradient_node(node):
"""
Check if the node is gradient type.

Args:
node (Node): The node object.

Returns:
bool, if the node is gradient type.
"""
if node.name.startswith('Gradients/') and node.type != NodeTypeEnum.PARAMETER.value:
return True
return False


def is_tensor_node(node):
"""
Check if the node is tensor type.

Args:
node (Node): The node object.

Returns:
bool, if the node is tensor type.
"""
if node is not None:
return True
return False
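A short usage sketch: the identifier resolves 'weight' to is_weight_node through the is_<type>_node naming rule. The node objects below are stand-ins, and the check assumes NodeTypeEnum.PARAMETER.value is 'Parameter':

from collections import namedtuple

from mindinsight.debugger.stream_cache.node_type_identifier import NodeTypeIdentifier

FakeNode = namedtuple('FakeNode', ['name', 'type'])  # illustrative stand-in
weight_check = NodeTypeIdentifier('weight')

print(weight_check.is_match(FakeNode('Default/conv1.weight', 'Parameter')))    # True
print(weight_check.is_match(FakeNode('gradients/conv1.weight', 'Parameter')))  # False, gradients excluded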

+ 13
- 1
mindinsight/debugger/stream_cache/tensor.py View File

@@ -19,7 +19,7 @@ import numpy as np

from mindinsight.utils.tensor import TensorUtils
from mindinsight.debugger.common.exceptions.exceptions import DebuggerParamValueError
from mindinsight.debugger.common.log import logger as log
from mindinsight.debugger.common.log import LOGGER as log
from mindinsight.debugger.common.utils import NUMPY_TYPE_MAP
from mindinsight.debugger.proto.ms_graph_pb2 import DataType

@@ -177,6 +177,18 @@ class OpTensor(BaseTensor):

return res

def get_tensor_statistics(self):
"""
Get Tensor statistics.

Returns:
dict, overall statistics.
"""
if not self._stats:
self._stats = TensorUtils.get_statistics_from_tensor(self.value)
statistics = TensorUtils.get_overall_statistic_dict(self._stats)
return statistics

def update_tensor_comparisons(self, tensor_comparison):
"""
Update tensor comparison for tensor.
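get_tensor_statistics delegates to TensorUtils and caches the intermediate statistics on the tensor instance. The two helpers it calls can be exercised directly; the exact keys of the resulting dict are defined in mindinsight/utils/tensor.py, which this commit also extends:

import numpy as np

from mindinsight.utils.tensor import TensorUtils

value = np.array([[1.0, -2.0], [3.0, float('nan')]], dtype=np.float32)
stats = TensorUtils.get_statistics_from_tensor(value)
print(TensorUtils.get_overall_statistic_dict(stats))  # max/min/mean style summary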


+ 77
- 45
mindinsight/debugger/stream_cache/watchpoint.py View File

@@ -13,23 +13,45 @@
# limitations under the License.
# ============================================================================
"""Define the watchpoint stream."""
from mindinsight.datavisual.data_transform.graph.node import NodeTypeEnum
from mindinsight.conditionmgr.common.utils import NodeBasicInfo
from mindinsight.debugger.common.exceptions.exceptions import DebuggerParamValueError
from mindinsight.debugger.common.log import logger as log
from mindinsight.debugger.common.log import LOGGER as log
from mindinsight.debugger.common.utils import is_scope_type
from mindinsight.debugger.proto.debug_grpc_pb2 import SetCMD, WatchCondition
from mindinsight.conditionmgr.condition import ConditionIdEnum


WATCHPOINT_CONDITION_MAPPING = {
'INF': WatchCondition.Condition.inf,
'NAN': WatchCondition.Condition.nan,
'OVERFLOW': WatchCondition.Condition.overflow,
'MAX_GT': WatchCondition.Condition.max_gt,
'MAX_LT': WatchCondition.Condition.max_lt,
'MIN_GT': WatchCondition.Condition.min_gt,
'MIN_LT': WatchCondition.Condition.min_lt,
'MAX_MIN_GT': WatchCondition.Condition.max_min_gt,
'MAX_MIN_LT': WatchCondition.Condition.max_min_lt,
'MEAN_GT': WatchCondition.Condition.mean_gt,
'MEAN_LT': WatchCondition.Condition.mean_lt
ConditionIdEnum.NAN.value: WatchCondition.Condition.nan,
ConditionIdEnum.INF.value: WatchCondition.Condition.inf,
ConditionIdEnum.OVERFLOW_ASCEND_CHIP.value: WatchCondition.Condition.overflow,
ConditionIdEnum.MAX_GT.value: WatchCondition.Condition.max_gt,
ConditionIdEnum.MAX_LT.value: WatchCondition.Condition.max_lt,
ConditionIdEnum.MIN_GT.value: WatchCondition.Condition.min_gt,
ConditionIdEnum.MIN_LT.value: WatchCondition.Condition.min_lt,
ConditionIdEnum.MAX_MIN_GT.value: WatchCondition.Condition.max_min_gt,
ConditionIdEnum.MAX_MIN_LT.value: WatchCondition.Condition.max_min_lt,
ConditionIdEnum.MEAN_GT.value: WatchCondition.Condition.mean_gt,
ConditionIdEnum.MEAN_LT.value: WatchCondition.Condition.mean_lt,
ConditionIdEnum.TENSOR_OVERFLOW.value: WatchCondition.Condition.tensor_general_overflow,
ConditionIdEnum.WEIGHT_OVERFLOW.value: WatchCondition.Condition.tensor_general_overflow,
ConditionIdEnum.OPERATOR_OVERFLOW.value: WatchCondition.Condition.overflow,
ConditionIdEnum.TENSOR_INITIALIZATION.value: WatchCondition.Condition.tensor_initialization,
ConditionIdEnum.WEIGHT_INITIALIZATION.value: WatchCondition.Condition.tensor_initialization,
ConditionIdEnum.TENSOR_TOO_LARGE.value: WatchCondition.Condition.tensor_too_large,
ConditionIdEnum.WEIGHT_TOO_LARGE.value: WatchCondition.Condition.tensor_too_large,
ConditionIdEnum.GRADIENT_TOO_LARGE.value: WatchCondition.Condition.tensor_too_large,
ConditionIdEnum.GRADIENT_EXPLODING.value: WatchCondition.Condition.tensor_general_overflow,
ConditionIdEnum.TENSOR_TOO_SMALL.value: WatchCondition.Condition.tensor_too_small,
ConditionIdEnum.WEIGHT_TOO_SMALL.value: WatchCondition.Condition.tensor_too_small,
ConditionIdEnum.GRADIENT_VANISHING.value: WatchCondition.Condition.tensor_too_small,
ConditionIdEnum.TENSOR_ALL_ZERO.value: WatchCondition.Condition.tensor_all_zero,
ConditionIdEnum.TENSOR_CHANGE_TOO_LARGE.value: WatchCondition.Condition.tensor_change_too_large,
ConditionIdEnum.WEIGHT_CHANGE_TOO_LARGE.value: WatchCondition.Condition.tensor_change_too_large,
ConditionIdEnum.TENSOR_CHANGE_TOO_SMALL.value: WatchCondition.Condition.tensor_change_too_small,
ConditionIdEnum.WEIGHT_CHANGE_TOO_SMALL.value: WatchCondition.Condition.tensor_change_too_small,
ConditionIdEnum.TENSOR_NOT_CHANGED.value: WatchCondition.Condition.tensor_not_changed,
ConditionIdEnum.WEIGHT_NOT_CHANGED.value: WatchCondition.Condition.tensor_not_changed
}


@@ -81,10 +103,8 @@ class WatchNodeTree:
def _translate_node_type(node_type):
"""Translate node type to watch node type."""
flag = node_type
if not node_type or node_type == NodeTypeEnum.NAME_SCOPE.value:
if not node_type or is_scope_type(node_type):
flag = 'scope'
elif node_type != NodeTypeEnum.AGGREGATION_SCOPE.value:
flag = 'leaf'
return flag

def get(self, sub_name):
@@ -191,7 +211,7 @@ class Watchpoint:
self._watch_node = other_watchpoint.nodes

def add_nodes(self, nodes):
"""Add node into watchcpoint."""
"""Add node into watchpoint."""
if not nodes:
log.warning("Add empty nodes.")
return
@@ -208,8 +228,7 @@ class Watchpoint:
if not isinstance(nodes, list):
nodes = [nodes]
for node in nodes:
node_name = node.split(':')[0]
self._watch_node.remove_node(node_name)
self._watch_node.remove_node(node.name)

def get_node_status(self, node_name, node_type, full_name):
"""Judge if the node is in watch nodes."""
@@ -229,40 +248,56 @@ class Watchpoint:

return status

def get_watch_node(self, cur_watch_node, watch_node_list):
def _get_watch_node(self, cur_watch_node, watch_node_list):
"""
Traverse the watch nodes and add total watched node list to `watch_node_list`.

Args:
cur_watch_node (WatchNodeTree): The current watch node.
watch_node_list (list[WatchNodeTree]): The list of total watched node.
watch_node_list (list[NodeBasicInfo]): The list of watch node basic infos.
"""
if cur_watch_node.watch_status == WatchNodeTree.TOTAL_WATCH and \
cur_watch_node.node_type != NodeTypeEnum.AGGREGATION_SCOPE.value:
watch_node_list.append(cur_watch_node)
if cur_watch_node.watch_status == WatchNodeTree.TOTAL_WATCH:
node_info = NodeBasicInfo(name=cur_watch_node.node_name,
full_name=cur_watch_node.full_name,
type=cur_watch_node.node_type)
watch_node_list.append(node_info)
return
for _, watch_node in cur_watch_node.get_children():
self.get_watch_node(watch_node, watch_node_list)
self._get_watch_node(watch_node, watch_node_list)

def get_set_cmd(self):
"""Return the watchpoint in proto format."""
# get watch nodes.
def get_watch_nodes(self):
"""
Get the name of all total watched nodes.

Returns:
list[NodeBasicInfo], the list of watch node basic infos.
"""
watch_nodes = []
self.get_watch_node(self._watch_node, watch_nodes)
self._get_watch_node(self._watch_node, watch_nodes)
return watch_nodes

def get_pending_cmd(self, watch_nodes):
"""Return the watchpoint in proto format."""
# construct SetCMD
set_cmd = SetCMD()
set_cmd.id = self._id
set_cmd.delete = False
set_cmd.watch_condition.condition = WATCHPOINT_CONDITION_MAPPING.get(
self._condition.get('condition'))
if self._condition.get('param'):
self._condition.get('id'))
for param in self._condition.get('params'):
# at most one param is provided
set_cmd.watch_condition.value = self._condition.get('param')
param_proto = set_cmd.watch_condition.params.add()
param_proto.name = param.get('name')
param_proto.value = param.get('value')
param_proto.disabled = param.get('disable')

# Only one parameter of condition in current version.
set_cmd.watch_condition.value = param.get('value')

for watch_node in watch_nodes:
event_node = set_cmd.watch_nodes.add()
event_node.node_name = watch_node.full_name
event_node.node_type = watch_node.node_type

event_node.node_type = watch_node.type
return set_cmd

def get_watch_condition_info(self):
@@ -277,22 +312,17 @@ class Watchpoint:
class WatchpointHit:
"""The watchpoint hit structure."""

def __init__(self, tensor_proto, watchpoint, node_name):
self._node_name = node_name
def __init__(self, tensor_proto, watchpoint, node_name, graph_name):
self._full_name = tensor_proto.node_name
self._slot = tensor_proto.slot
self._watchpoint = watchpoint
self.node_name = node_name
self.slot = tensor_proto.slot
self.graph_name = graph_name

@property
def tensor_full_name(self):
"""The property of tensor full name."""
tensor_name = ':'.join([self._full_name, self._slot])
return tensor_name

@property
def tensor_name(self):
"""The property of tensor ui name."""
tensor_name = ':'.join([self._node_name, self._slot])
tensor_name = ':'.join([self._full_name, self.slot])
return tensor_name

@property
@@ -303,5 +333,7 @@ class WatchpointHit:

def __eq__(self, other):
"""Define the equal condition."""
flag = self.tensor_full_name == other.tensor_full_name and self.watchpoint == other.watchpoint
flag = self.tensor_full_name == other.tensor_full_name \
and self.watchpoint == other.watchpoint \
and self.graph_name == other.graph_name
return flag
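A sketch of the id-to-proto translation above: several UI-level condition ids (the tensor, weight, and gradient variants) deliberately share one backend condition, as the enlarged mapping shows.

from mindinsight.conditionmgr.condition import ConditionIdEnum
from mindinsight.debugger.proto.debug_grpc_pb2 import WatchCondition
from mindinsight.debugger.stream_cache.watchpoint import WATCHPOINT_CONDITION_MAPPING

backend = WATCHPOINT_CONDITION_MAPPING[ConditionIdEnum.GRADIENT_TOO_LARGE.value]
assert backend == WatchCondition.Condition.tensor_too_large
assert backend == WATCHPOINT_CONDITION_MAPPING[ConditionIdEnum.TENSOR_TOO_LARGE.value]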

+ 1
- 1
mindinsight/debugger/stream_handler/event_handler.py View File

@@ -18,7 +18,7 @@ from queue import Queue, Empty
from threading import Lock

from mindinsight.debugger.common.exceptions.exceptions import DebuggerParamValueError
from mindinsight.debugger.common.log import logger as log
from mindinsight.debugger.common.log import LOGGER as log
from mindinsight.debugger.stream_handler.base_handler import StreamHandlerBase




+ 472
- 104
mindinsight/debugger/stream_handler/graph_handler.py View File

@@ -13,10 +13,14 @@
# limitations under the License.
# ============================================================================
"""Define the graph stream handler."""
from mindinsight.conditionmgr.common.utils import NodeBasicInfo
from mindinsight.conditionmgr.condition import TargetTypeEnum as CategoryTypeEnum
from mindinsight.debugger.common.exceptions.exceptions import DebuggerParamValueError, \
DebuggerNodeNotInGraphError, DebuggerGraphNotExistError
from mindinsight.debugger.common.log import logger as log
from mindinsight.debugger.common.log import LOGGER as log
from mindinsight.debugger.common.utils import is_scope_type
from mindinsight.debugger.stream_cache.debugger_graph import DebuggerGraph
from mindinsight.debugger.stream_cache.debugger_multigraph import DebuggerMultiGraph
from mindinsight.debugger.stream_handler.base_handler import StreamHandlerBase


@@ -24,16 +28,41 @@ class GraphHandler(StreamHandlerBase):
"""Metadata Handler."""

def __init__(self):
self._graph_proto = None
self._graph = None
self._searched_node_list = []
# dict of <graph_name, GraphProto object>
self._graph_proto = {}
# dict of <graph_name, DebuggerGraph object>
self._graph = {}
self._searched_node_list = {}
# list of node names in bfs order
self.bfs_order = []
# dict of <node full name, graph_name>
self.graph_node_map = {}
# dict of <node ui name, Node object> for all graphs
self._all_leaf_nodes = {}

# the whole graph
self._whole_graph = None

@property
def whole_graph(self):
"""The property of whole_graph."""
return self._whole_graph

@property
def graph(self):
"""The property of graph."""
return self._graph_proto

@property
def graph_names(self):
"""The property of graph names."""
return list(self._graph)

@property
def debugger_graph_obj(self):
"""The property of graph object."""
return self._graph

def put(self, value):
"""
Put value into graph cache. Called by grpc server.
@@ -41,14 +70,23 @@ class GraphHandler(StreamHandlerBase):
Args:
value (GraphProto): The Graph proto message.
"""
self._graph_proto = value
log.info("Put graph into cache.")

# build graph
graph = DebuggerGraph()
graph.build_graph(value)
self._graph = graph
self.bfs_order = self._graph.get_bfs_order()
for graph_name, graph_value in value.items():
self._graph_proto[graph_name] = graph_value
# build sub graph
graph = DebuggerGraph()
graph.build_graph(graph_value)
self._graph[graph_name] = graph
self.bfs_order.extend(graph.get_bfs_order())
leaf_nodes = graph.leaf_nodes
self._all_leaf_nodes.update(leaf_nodes)
for _, node in leaf_nodes.items():
self.graph_node_map[node.full_name] = graph_name

# build whole graph
graph = DebuggerMultiGraph()
graph.add_graph(self._graph)
self._whole_graph = graph

def get(self, filter_condition=None):
"""
@@ -58,7 +96,7 @@ class GraphHandler(StreamHandlerBase):
filter_condition (dict):

- name (str): The full debug node name.
- graph_name (str): The relative graph_name of the node.
- single_node (bool): If True, return the graph from root
to the specific node; else, return the sublayer of the
graph. Default: False.
@@ -73,47 +111,121 @@ class GraphHandler(StreamHandlerBase):
'please start the training script first.')
return {'graph': {}}

graph = {}
if filter_condition is None:
filter_condition = {}
graph = {'graph_names': self.graph_names}

single_node = filter_condition.get('single_node', False)
name = filter_condition.get('name')

graph = {}
graph_name = filter_condition.get('graph_name')
if single_node is True:
nodes = self.get_single_node(name)
nodes = self._get_single_node(name, graph_name)
else:
nodes = self.list_nodes(name)
nodes = self._list_nodes(name, graph_name)
graph.update(nodes)

return {'graph': graph}

def get_tensor_history(self, node_name, depth=0):
def _get_single_node(self, name, graph_name=None):
"""
Search a node and return the nodes of every layer leading to it.

Args:
name (str): The name of the node.
graph_name (str): The graph name. Default: None.

Returns:
dict, the nodes of every layer leading to this node.
"""
if graph_name:
graph = self._get_graph(graph_name=graph_name)
searched_graph = graph.search_single_node(name)
else:
searched_graph = self._whole_graph.search_single_node(name)

return searched_graph

def _list_nodes(self, scope, graph_name):
"""
Get the nodes of every layer in graph.

Args:
scope (str): The name of a scope.
graph_name (str): The graph name.

Returns:
dict, the nodes of every layer under the scope, format is {'nodes': [<Node object>]}.
example:
{
"nodes" : [
{
"attr" :
{
"index" : "i: 0\n"
},
"input" : {},
"name" : "input_tensor",
"output" :
{
"Default/TensorAdd-op17" :
{
"edge_type" : "data",
"scope" : "name_scope",
"shape" : [1, 16, 128, 128]
}
},
"output_i" : -1,
"proxy_input" : {},
"proxy_output" : {},
"independent_layout" : False,
"subnode_count" : 0,
"type" : "Data"
}
]
}
"""
if graph_name:
graph = self._get_graph(graph_name, scope)
nodes = graph.list_node_by_scope(scope=scope)
res = {'nodes': nodes}
else:
nodes = self._whole_graph.list_node_by_scope(scope=scope)
res = {'nodes': nodes}

return res

def get_tensor_history(self, node_name, graph_name=None, depth=0):
"""
Get the tensor history of a specified node.

Args:
node_name (str): The debug name of the node.
graph_name (str): The graph_name. Default: None.
depth (int): The number of layers the user
wants to trace. Default is 0.

Returns:
dict, basic tensor history, only including tensor name and tensor type and node type.
"""
self._graph_exists()
if not self._graph.exist_node(node_name):
raise DebuggerNodeNotInGraphError(node_name)

tensor_history, cur_outputs_nums = self._graph.get_tensor_history(
node_name, depth
)
graph_name, node_name = self._parse_node_name(node_name, graph_name)
graph = self._get_graph(graph_name=graph_name, node_name=node_name)
# validate node type, scope node has no tensor history
node_type = graph.get_node_type(node_name)
if is_scope_type(node_type):
log.error("Scope type node has no tensor history.")
raise DebuggerParamValueError("Invalid leaf node name.")
# get tensor history
tensor_history, cur_outputs_nums = graph.get_tensor_history(node_name, depth)
# add the tensor type for tensor history
self._update_tensor_history(tensor_history[0:cur_outputs_nums], 'output')
self._update_tensor_history(tensor_history[cur_outputs_nums:], 'input')
self._update_tensor_history(tensor_history[0:cur_outputs_nums], 'output', graph_name)
self._update_tensor_history(tensor_history[cur_outputs_nums:], 'input', graph_name)
log.debug("Get %d tensors in tensor history for node <%s>.", len(tensor_history), node_name)
return {'tensor_history': tensor_history}

@staticmethod
def _update_tensor_history(tensor_history, tensor_type):
def _update_tensor_history(tensor_history, tensor_type, graph_name):
"""
Add tensor source type for tensor history.

@@ -122,115 +234,285 @@ class GraphHandler(StreamHandlerBase):
keys: `node_type` and `name`. `node_type` refers to the type of the node which
the tensor come from. `name` refers to the tensor name.
tensor_type (str): The source type of the tensor. `input` or `output`.
graph_name (str): The graph name.
"""
for single_tensor_info in tensor_history:
single_tensor_info['type'] = tensor_type
single_tensor_info['graph_name'] = graph_name

def search_nodes(self, pattern):
"""
Search nodes by given pattern.

Args:
pattern (Union[str, None]): The pattern of the node to search,
if None, return all node names.
pattern (dict): Filter condition.

- name (str): The name pattern.
- graph_name (str): The graph name.
- node_category (str): The node category. Default: None.
- condition (dict): The additional filter condition.

Returns:
dict, the searched node.
"""
self._graph_exists()
self._searched_node_list = self._graph.search_nodes_by_pattern(pattern)
nodes = self._graph.get_nodes(self._searched_node_list)
graph_name = pattern.pop('graph_name', None)
search_nodes = self.get_searched_nodes(pattern, graph_name)
# construct to search tree
if not self._has_graph_scope(graph_name):
for graph_name, searched_node_list in search_nodes.items():
graph = self._get_graph(graph_name=graph_name)
format_nodes = graph.get_nodes(searched_node_list)
return {'nodes': format_nodes}
# handle the case where graph_name is None
res = []
for graph_name, graph in self._graph.items():
format_nodes = graph.get_nodes(search_nodes.get(graph_name, []))
if not format_nodes:
continue
self._add_graph_scope_for_nodes(format_nodes, graph_name)
search_graph = {
'name': graph_name,
'type': 'name_scope',
'nodes': format_nodes
}
res.append(search_graph)
return {'nodes': res}

def get_searched_node_list(self, pattern, graph_name):
"""Get searched node list in single graph."""
searched_nodes = self.get_searched_nodes(pattern, graph_name)
return searched_nodes.get(graph_name, [])

def get_searched_nodes(self, pattern, graph_name=None):
"""
Search nodes by given pattern.

Args:
pattern (dict): Filter condition.

- name (str): The name pattern.
- node_category (str): The node category. Default: None.
- condition (dict): The additional filter condition.
graph_name (str): The graph name. If not given, search in all sub graphs. Default: None.

Returns:
dict, the searched nodes. The format is dict of <graph_name, list[Node]>.
"""
if not graph_name:
graph_names = self.graph_names
else:
graph_names = [graph_name]
search_nodes = {}
for sub_graph_name in graph_names:
search_nodes[sub_graph_name] = self._search_in_single_graph(pattern, sub_graph_name)
return search_nodes

return {'nodes': nodes}
def _search_in_single_graph(self, pattern, graph_name=None):
"""
Search nodes by given pattern.

def get_nodes_by_scope(self, scope_name):
Args:
pattern (dict): Filter condition.

- name (str): The name pattern.
- node_category (str): The node_category. Default: None.
- condition (dict): The additional filter condition.
graph_name (str): The graph name.

Returns:
list, the searched node list.
"""
temp_node_list = []
node_category = pattern.get('node_category')
if graph_name:
graph = self._get_graph(graph_name=graph_name)
else:
graph = self._whole_graph
# filter nodes by name
if pattern.get('name'):
if node_category:
# get leaf nodes for forward filter
temp_node_list = graph.search_leaf_nodes_by_pattern(pattern.get('name'))
else:
# optimize search nodes
temp_node_list = graph.search_nodes_by_pattern(pattern.get('name'))
if not temp_node_list:
log.debug("No node named %s", pattern.get('name'))
return []
# filter nodes by category
if node_category:
node_category = self._get_inner_node_category(node_category)
condition = pattern['condition'].copy() if pattern.get('condition') else {}
condition['search_range'] = temp_node_list
temp_node_list = graph.search_nodes_by_category(node_category, condition=condition)
return temp_node_list

@staticmethod
def _get_inner_node_category(node_category):
"""
Get inner node category.

Args:
node_category (str): The node category supported in
mindinsight.conditionmgr.condition.TargetTypeEnum.

Returns:
CategoryTypeEnum, the translated value.
"""
try:
res = CategoryTypeEnum(node_category)
except ValueError as err:
log.error("Invalid node category. %s", err)
raise DebuggerParamValueError("Invalid node_category.")
return res

def get_nodes_by_scope(self, scope_name, graph_name):
"""
Get node by a given scope name.

Args:
scope_name (str): The name of scope.
graph_name (str): The relative graph_name of the watched node. Default: None.

Returns:
list[Node], a list of node.
"""
return self._graph.search_leaf_nodes_by_pattern(scope_name)
if graph_name:
graph = self._get_graph(graph_name)
else:
graph = self._whole_graph
return graph.search_leaf_nodes_by_pattern(scope_name)

def get_graph_id_by_name(self, node_name):
"""
Get graph id by node name.

Args:
node_name (str): The name of the node.

Returns:
str, the graph name of the node.

Raises:
DebuggerGraphNotExistError: If the node can not be found in any graph.
"""
if node_name:
for graph_name, sub_graph in self._graph.items():
if sub_graph.exist_node(name=node_name):
return graph_name
log.error('Failed to find node %s in graph. Please make sure the graph has been sent and '
'the node name is correct, and try again.', node_name)
raise DebuggerGraphNotExistError

def get_graph_id_by_full_name(self, node_name):
"""
Get graph id by full name.

def get_searched_node_list(self):
"""Get searched node list."""
return self._searched_node_list
Args:
node_name (str): The full name of the node.

Returns:
str, the graph name of the node.

Raises:
DebuggerNodeNotInGraphError: If the node can not be found in any graph.
"""
graph_id = self.graph_node_map.get(node_name) if node_name else None
if not graph_id:
log.error("Failed to get graph id by full name: %s", node_name)
raise DebuggerNodeNotInGraphError(node_name)
return graph_id

def get_node_type(self, node_name):
def get_node_type(self, node_name, graph_name=None):
"""
Get the type of the specified node.

Args:
node_name (str): The debug name of the node.
graph_name (str): The relative graph_name of the node. Default: None.

Returns:
A string of the node type, name_scope or leaf.
"""
self._graph_exists()
node_type = self._graph.get_node_type(node_name)
if graph_name:
graph = self._get_graph(node_name=node_name, graph_name=graph_name)
else:
graph = self._whole_graph
node_type = graph.get_node_type(node_name)

return node_type

def get_full_name(self, node_name):
def get_full_name(self, node_name, graph_name=None):
"""Get full name according to ui node name."""
full_name = self._graph.get_full_name_by_node_name(node_name) if node_name else ''
full_name = ''
if node_name:
if graph_name:
graph = self._get_graph(node_name=node_name, graph_name=graph_name)
else:
graph = self._whole_graph
full_name = graph.get_full_name_by_node_name(node_name)

return full_name

def get_node_name_by_full_name(self, full_name):
"""Get UI node name by full name."""
if self._graph:
node_name = self._graph.get_node_name_by_full_name(full_name)
else:
node_name = ''
log.info("No graph received yet.")
return node_name
def get_node_basic_info(self, node_name, graph_name):
"""Get node basic info with graph scope."""
graph_name, node_name = self._parse_node_name(node_name=node_name, graph_name=graph_name)
graph = self._get_graph(graph_name, node_name)
full_name = graph.get_full_name_by_node_name(node_name)
node_type = graph.get_node_type(node_name)
return self.construct_node_basic_info(full_name, graph_name, node_name, node_type)

def list_nodes(self, scope):
def get_tensor_graph(self, tensor_name, graph_name):
"""
Get the nodes of every layer in graph.
Get tensor graph according to node name.

Args:
scope (str): The name of a scope.
tensor_name (str): Tensor name, format is "<node_name>:<slot>".
graph_name (str): The relative graph_name of the node. Default: None.

Returns:
TypedDict('Nodes', {'nodes': list[Node]}), format is {'nodes': [<Node object>]}.
example:
{
"nodes" : [
{
"attr" :
{
"index" : "i: 0\n"
},
"input" : {},
"name" : "input_tensor",
"output" :
{
"Default/TensorAdd-op17" :
{
"edge_type" : "data",
"scope" : "name_scope",
"shape" : [1, 16, 128, 128]
}
},
"output_i" : -1,
"proxy_input" : {},
"proxy_output" : {},
"independent_layout" : False,
"subnode_count" : 0,
"type" : "Data"
}
]
}
dict, the tensor graph of the relative nodes.
"""
node_name, _ = tensor_name.rsplit(':', 1)
graph = self._get_graph(graph_name=graph_name, node_name=node_name)
tensor_graph = graph.get_tensor_graph(node_name)
return {'graph': tensor_graph}

@staticmethod
def construct_node_basic_info(full_name, graph_name, node_name, node_type):
"""Construct node basic info."""
node_name_with_graph_scope = '/'.join([graph_name, node_name]) if node_name else graph_name
return NodeBasicInfo(name=node_name_with_graph_scope, full_name=full_name, type=node_type)

def get_node_basic_info_by_scope(self, scope_name, graph_name):
"""
if scope and not self._graph.exist_node(scope):
raise DebuggerNodeNotInGraphError(node_name=scope)
Get node by a given scope name.

nodes = self._graph.list_node_by_scope(scope=scope)
return {'nodes': nodes}
Args:
scope_name (str): The name of scope.
graph_name (str): The relative graph_name of the watched node. Default: None.

Returns:
list[NodeBasicInfo], a list of node.
"""
graph_name, node_name = self._parse_node_name(scope_name, graph_name)
graph = self._get_graph(graph_name)
nodes = graph.search_leaf_nodes_by_pattern(node_name)
res = [self.construct_node_basic_info(full_name=node.full_name,
graph_name=graph_name,
node_name=node.name,
node_type=node.type) for node in nodes]
return res

def get_node_name_by_full_name(self, full_name, graph_name):
"""Get UI node name by full name and graph name."""
if graph_name and full_name:
graph = self._get_graph(graph_name)
node_name = graph.get_node_name_by_full_name(full_name)
else:
node_name = ''
log.debug("Get empty full name.")
return node_name

def get_node_by_bfs_order(self, node_name=None, ascend=True):
"""
@@ -240,11 +522,9 @@ class GraphHandler(StreamHandlerBase):
node_name (str): The name of current chosen leaf node.
ascend (bool): If True, traverse the input nodes;
If False, traverse the output nodes. Default is True.

Returns:
Union[None, dict], the next node object in dict type or None.
"""
self._graph_exists()
bfs_order = self.bfs_order
length = len(bfs_order)

@@ -269,11 +549,11 @@ class GraphHandler(StreamHandlerBase):
f'Please check the node name {err}.'
raise DebuggerParamValueError(msg)

next_node = self.get_next_node_in_bfs(index, length, ascend)
next_node = self._get_next_node_in_bfs(index, length, ascend)

return next_node

def get_next_node_in_bfs(self, index, length, ascend):
def _get_next_node_in_bfs(self, index, length, ascend):
"""
Get the next node in bfs order.

@@ -294,28 +574,116 @@ class GraphHandler(StreamHandlerBase):

return next_node

def get_single_node(self, name):
def _graph_exists(self):
"""
Search node, and return every layer nodes until this node.
Check if the graph has been loaded in the debugger cache.

Raises:
DebuggerGraphNotExistError: If the graph does not exist.
"""
if not self._graph:
log.error('The graph does not exist. Please start the '
'training script and try again.')
raise DebuggerGraphNotExistError

def _get_graph(self, graph_name=None, node_name=None):
"""
Get the graph object according to graph name and node name.

Args:
name (str): The name of node.
graph_name (str): The graph name.
node_name (str): The node name.

Returns:
dict, every layer nodes until this node.
DebuggerGraph, the graph object.

Raises:
DebuggerGraphNotExistError: If the graph does not exist.
"""
if not graph_name and not node_name and len(self._graph) == 1:
# get the graph if there is only one graph
return list(self._graph.values())[0]
graph_name = graph_name if graph_name else self.get_graph_id_by_name(node_name)
graph = self._graph.get(graph_name) if graph_name else None
# get graph according to graph name and check the node
if graph and (not node_name or graph.exist_node(name=node_name)):
return graph
log.error('The graph %s does not exist node %s.', graph_name, node_name)
raise DebuggerGraphNotExistError

def _has_graph_scope(self, graph_name):
"""Check if query with graph_scope."""
return bool(graph_name is None and len(self._graph) > 1)

def validate_graph_name(self, graph_name):
"""Validate graph_name."""
if graph_name and self._graph.get(graph_name) is None:
log.error("No graph named %s in debugger cache.", graph_name)
raise DebuggerGraphNotExistError
if not graph_name and len(self._graph) == 1:
graph_name = self.graph_names[0]
return graph_name

def _add_graph_scope_for_nodes(self, nodes, graph_name):
"""
Add graph scope for nodes.

Args:
nodes (list[Node]): List of nodes object.
graph_name (str): The graph name.
"""
def _get_updated_node_info(cur_node, node_type):
"""Add graph scope in key."""
old_node = cur_node.get(node_type)
if not old_node:
return
new_values = {}
for old_name, node_info in old_node.items():
new_name = '/'.join([graph_name, old_name]) if old_name else graph_name
new_values[new_name] = node_info
cur_node[node_type] = new_values

for node in nodes:
node['name'] = '/'.join([graph_name, node['name']]) if node['name'] else graph_name
_get_updated_node_info(node, 'input')
_get_updated_node_info(node, 'output')
if node.get('nodes'):
self._add_graph_scope_for_nodes(node.get('nodes'), graph_name)

def _parse_node_name(self, node_name, graph_name):
"""
Check if the node name should have graph scope.

Args:
node_name (str): The ui node name.
graph_name (str): The graph name.

Returns:
str, parsed graph name.
str, parsed node name.
"""
node_name = '' if node_name is None else node_name
if self._has_graph_scope(graph_name):
names = node_name.split("/", 1)
graph_name = names[0]
node_name = names[1] if len(names) == 2 else ''
if graph_name is None and len(self._graph) == 1:
graph_name = self.graph_names[0]
return graph_name, node_name
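For reference, a minimal standalone sketch of the splitting rule above; the helper name and the graph names are illustrative assumptions, not part of the module:

def parse_node_name_sketch(node_name, graph_name, graph_names):
    # Hypothetical sketch of _parse_node_name; graph_names stands in
    # for the list of loaded graph names.
    node_name = '' if node_name is None else node_name
    if graph_name is None and len(graph_names) > 1:
        # Multi-graph query: the first path segment is the graph scope.
        names = node_name.split('/', 1)
        graph_name = names[0]
        node_name = names[1] if len(names) == 2 else ''
    if graph_name is None and len(graph_names) == 1:
        graph_name = graph_names[0]
    return graph_name, node_name

# With two graphs loaded, the scope is split off the UI name:
assert parse_node_name_sketch('graph_0/Default/conv1', None, ['graph_0', 'graph_1']) == \
    ('graph_0', 'Default/conv1')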

def validate_node_name(self, node_name, graph_name):
"""
Validate the graph exist the specified node.

Args:
node_name (str): The ui node name.
graph_name (str): The graph name.

Raises:
DebuggerGraphNotExistError: If the graph does not exist.
DebuggerNodeNotInGraphError: If the node can not be found in the graph.
"""
if self._graph is None:
log.error('The graph does not exist. Please start the '
'training script and try again.')
raise DebuggerGraphNotExistError
graph = self._get_graph(graph_name=graph_name)
if not graph.exist_node(name=node_name):
log.error("graph %s doesn't find node: %s.", graph_name, node_name)
raise DebuggerNodeNotInGraphError(node_name)

+ 37 - 5  mindinsight/debugger/stream_handler/metadata_handler.py

@@ -13,7 +13,7 @@
# limitations under the License.
# ============================================================================
"""Define the metadata stream handler."""
from mindinsight.debugger.common.log import logger as log
from mindinsight.debugger.common.log import LOGGER as log
from mindinsight.debugger.common.utils import ServerStatus
from mindinsight.debugger.stream_handler.base_handler import StreamHandlerBase

@@ -29,6 +29,8 @@ class MetadataHandler(StreamHandlerBase):
self._cur_node_name = ""
self._cur_full_name = ""
self._backend = ""
self._enable_recheck = False
self._cur_graph_name = ""

@property
def device_name(self):
@@ -50,6 +52,16 @@ class MetadataHandler(StreamHandlerBase):
"""The property of current node name."""
self._cur_node_name = node_name

@property
def graph_name(self):
"""The property of current node name."""
return self._cur_graph_name

@graph_name.setter
def graph_name(self, graph_name):
"""The property of current node name."""
self._cur_graph_name = graph_name if graph_name else ''

@property
def full_name(self):
"""The property of current node name."""
@@ -90,6 +102,21 @@ class MetadataHandler(StreamHandlerBase):
"""
self._client_ip = str(value)

@property
def enable_recheck(self):
"""The property of enable_recheck."""
return self._enable_recheck and self._state == ServerStatus.WAITING and self._step > 0

@enable_recheck.setter
def enable_recheck(self, value):
"""
Set the property of enable_recheck.

Args:
value (bool): The new value of enable_recheck.
"""
self._enable_recheck = bool(value)

def put(self, value):
"""
Put value into metadata cache. Called by grpc server.
@@ -108,7 +135,7 @@ class MetadataHandler(StreamHandlerBase):
Get updated value. Called by main server.

Args:
filter_condition (str): The filter property.
filter_condition (Union[str, list[str]]): The filter property.

Returns:
dict, the metadata.
@@ -122,10 +149,15 @@ class MetadataHandler(StreamHandlerBase):
'pos': '0',
'ip': self.client_ip,
'node_name': self.node_name,
'backend': self.backend
'backend': self.backend,
'enable_recheck': self.enable_recheck,
'graph_name': self.graph_name
}
else:
metadata[filter_condition] = getattr(self, filter_condition) if \
hasattr(self, filter_condition) else ''
if not isinstance(filter_condition, list):
filter_condition = [filter_condition]
for field in filter_condition:
metadata[field] = getattr(self, field) if \
hasattr(self, field) else None

return {'metadata': metadata}
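The updated branch accepts either a single field name or a list of names; a rough sketch of just that branch (the class and its field values are hypothetical):

class MetadataSketch:
    # Hypothetical stand-in for MetadataHandler, showing only the
    # filtered branch of get().
    state = 'waiting'
    step = 1
    backend = 'GPU'

    def get(self, filter_condition):
        metadata = {}
        if not isinstance(filter_condition, list):
            filter_condition = [filter_condition]
        for field in filter_condition:
            metadata[field] = getattr(self, field) if hasattr(self, field) else None
        return {'metadata': metadata}

assert MetadataSketch().get('state') == {'metadata': {'state': 'waiting'}}
assert MetadataSketch().get(['step', 'backend']) == {'metadata': {'step': 1, 'backend': 'GPU'}}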

+ 59 - 14  mindinsight/debugger/stream_handler/tensor_handler.py

@@ -17,7 +17,7 @@ import numpy as np

from mindinsight.datavisual.data_transform.graph.node import NodeTypeEnum
from mindinsight.debugger.common.exceptions.exceptions import DebuggerParamValueError
from mindinsight.debugger.common.log import logger as log
from mindinsight.debugger.common.log import LOGGER as log
from mindinsight.debugger.proto.ms_graph_pb2 import DataType
from mindinsight.debugger.stream_cache.tensor import OpTensor, ConstTensor
from mindinsight.debugger.stream_handler.base_handler import StreamHandlerBase
@@ -32,6 +32,16 @@ class TensorHandler(StreamHandlerBase):
self._tensors = {}
self._cur_step = 0

@property
def cur_step(self):
"""The property of current step."""
return self._cur_step

@property
def prev_step(self):
"""The property of previous step."""
return self._cur_step - 1

def put(self, value):
"""
Put value into tensor cache. Called by grpc server.
@@ -98,7 +108,7 @@ class TensorHandler(StreamHandlerBase):
self._tensors[tensor.name] = cache_tensor

old_tensor = cache_tensor.get(step)
if old_tensor and not self.is_value_diff(old_tensor.value, tensor.value):
if old_tensor and not self._is_value_diff(old_tensor.value, tensor.value):
log.debug("Tensor %s of step %s has no change. Ignore it.", tensor.name, step)
return False
cache_tensor[step] = tensor
@@ -106,7 +116,7 @@ class TensorHandler(StreamHandlerBase):
return True

@staticmethod
def is_value_diff(old_value, new_value):
def _is_value_diff(old_value, new_value):
"""Check tensor value if there are equal."""
log.debug("old value type: %s, new_value type: %s", type(old_value), type(new_value))
if old_value is None and new_value is None:
@@ -142,22 +152,28 @@ class TensorHandler(StreamHandlerBase):
Args:
filter_condition (dict): Filter condition.

- name (str): The name of tensor.
- name (str): The full name of tensor.

- node_type (str): The type of the node.

- prev (bool): Whether to get previous tensor.

Returns:
dict, the tensor_value.
"""
name = filter_condition.get('name')
node_type = filter_condition.get('node_type')
shape = filter_condition.get('shape')
tensor = self._get_tensor(name, node_type)
if filter_condition.get('prev'):
step = self.prev_step
else:
step = self.cur_step
tensor = self._get_tensor(name, node_type, step)
if not tensor:
log.error("No tensor named %s", name)
log.error("No tensor named %s at the step %s", name, step)
raise DebuggerParamValueError("No tensor named {}".format(name))
tensor_info = tensor.get_full_info(shape)
self._update_has_prev_step_field(tensor_info, name, node_type)
self._update_has_prev_step_field(tensor_info, name, node_type, step)
return {'tensor_value': tensor_info}

def _get_tensor(self, tensor_name, node_type=None, step=None):
@@ -167,7 +183,7 @@ class TensorHandler(StreamHandlerBase):
Args:
tensor_name (str): Tensor name, format like `node_name:slot`.
node_type (str): Node type.
step (int): The step of tensor info. Default: None. Noe
step (int): The step of tensor info. Default: None.

Returns:
Union[OpTensor, ConstTensor], the tensor object.
@@ -178,7 +194,8 @@ class TensorHandler(StreamHandlerBase):
if not tensor and node_type == NodeTypeEnum.CONST.value:
const_name = tensor_name.rsplit('/', 1)[-1]
tensor = self._const_vals.get(const_name)
self._tensors[tensor_name] = {step: tensor}
if tensor:
self._tensors[tensor_name] = {step: tensor}

return tensor
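The cache is keyed as {tensor_name: {step: tensor}}, with const values falling back to a short-name table; a sketch under those assumptions (the node-type string and helper name are illustrative):

def get_cached_tensor(tensors, const_vals, tensor_name, node_type, step):
    # Sketch of the lookup order in _get_tensor: per-step cache first,
    # then the const table keyed by the short name.
    tensor = tensors.get(tensor_name, {}).get(step)
    if not tensor and node_type == 'Const':
        const_name = tensor_name.rsplit('/', 1)[-1]
        tensor = const_vals.get(const_name)
        if tensor:
            # Memoize the const under the full tensor name for this step.
            tensors[tensor_name] = {step: tensor}
    return tensor

cache, consts = {}, {'cst1:0': 'const-tensor'}
assert get_cached_tensor(cache, consts, 'Default/cst1:0', 'Const', 0) == 'const-tensor'
assert cache == {'Default/cst1:0': {0: 'const-tensor'}}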

@@ -205,7 +222,7 @@ class TensorHandler(StreamHandlerBase):
tensor_name = tensor_info.get('full_name')
node_type = tensor_info.get('node_type')
basic_info = self._get_basic_info(tensor_name, node_type)
flag = self._update_has_prev_step_field(basic_info, tensor_name, node_type)
flag = self._update_has_prev_step_field(basic_info, tensor_name, node_type, self.cur_step)
if flag is False:
missed_tensor = tensor_info.copy()
missed_tensor['iter'] = 'prev'
@@ -223,22 +240,23 @@ class TensorHandler(StreamHandlerBase):

return missed_tensors

def _update_has_prev_step_field(self, tensor_info, tensor_name, node_type):
def _update_has_prev_step_field(self, tensor_info, tensor_name, node_type, step):
"""Update has_prev_step field in tensor info."""
flag = None
cur_tensor_value = bool(tensor_info and tensor_info.get('value') is not None)
if node_type == NodeTypeEnum.PARAMETER.value:
flag = self._get_prev_tensor_value_status(tensor_name)
flag = self._get_prev_tensor_value_status(tensor_name, step)
if flag and cur_tensor_value:
tensor_info['has_prev_step'] = True
return flag

def _get_prev_tensor_value_status(self, tensor_name):
def _get_prev_tensor_value_status(self, tensor_name, step):
"""
Get the status of tensor value of previous step.

Args:
tensor_name (str): Tensor name.
step (int): The step of the tensor.

Returns:
Union[None, bool], the status of previous tensor value. If True, there is valid previous
@@ -247,7 +265,7 @@ class TensorHandler(StreamHandlerBase):
"""
flag = None
# check if the tensor has previous step value.
prev_step = self._cur_step - 1
prev_step = step - 1
if prev_step < 0:
return flag
tensor = self._get_tensor(tensor_name, step=prev_step)
@@ -314,6 +332,8 @@ class TensorHandler(StreamHandlerBase):
tensor_comparison = curr_tensor.tensor_comparison
if not tensor_comparison or tensor_comparison.tolerance != tolerance:
if isinstance(curr_tensor.value, np.ndarray) and isinstance(prev_tensor.value, np.ndarray):
if curr_tensor.value.shape != prev_tensor.value.shape:
raise DebuggerParamValueError("The shape of these two step tensors is not the same.")
tensor_diff = TensorUtils.calc_diff_between_two_tensor(curr_tensor.value, prev_tensor.value, tolerance)
if not tensor_comparison:
stats = TensorUtils.get_statistics_from_tensor(tensor_diff)
@@ -333,9 +353,34 @@ class TensorHandler(StreamHandlerBase):
result = np.stack([prev_tensor_slice, curr_tensor_slice, tensor_diff_slice], axis=-1)
tensor_info['diff'] = result.tolist()
stats = TensorUtils.get_statistics_from_tensor(tensor_diff_slice)
curr_tensor_stats = TensorUtils.get_statistics_from_tensor(curr_tensor.value)
curr_tensor_slice_stats = TensorUtils.get_statistics_from_tensor(curr_tensor_slice)
prev_tensor_stats = TensorUtils.get_statistics_from_tensor(prev_tensor.value)
prev_tensor_slice_stats = TensorUtils.get_statistics_from_tensor(prev_tensor_slice)
tensor_info['curr_step_statistics'] = TensorUtils.get_statistics_dict(stats=curr_tensor_slice_stats,
overall_stats=curr_tensor_stats)
tensor_info['prev_step_statistics'] = TensorUtils.get_statistics_dict(stats=prev_tensor_slice_stats,
overall_stats=prev_tensor_stats)
tensor_info['statistics'] = TensorUtils.get_statistics_dict(stats=stats,
overall_stats=tensor_comparison.stats)
elif isinstance(curr_tensor_slice, str):
tensor_info['diff'] = curr_tensor_slice
reply = {'tensor_value': tensor_info}
return reply

def get_tensor_statistics(self, tensor_name, node_type):
"""
Get Tensor statistics.

Args:
tensor_name (str): Tensor name, format like `node_name:slot`.
node_type (str): Node type.

Returns:
dict, overall statistics.
"""
res = {}
tensor = self._get_tensor(tensor_name, node_type)
if tensor:
res = tensor.get_tensor_statistics()
return res
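Judging from the updated compare_tensors.json expectation further down, a populated statistics dict carries both slice-level and overall_* fields, for example:

# Example payload shape, taken from the compare_tensors.json fixture below:
statistics = {
    'max': 6.0, 'min': 1.0, 'avg': 3.5, 'count': 6,
    'nan_count': 0, 'neg_inf_count': 0, 'pos_inf_count': 0,
    'overall_max': 6.0, 'overall_min': 1.0, 'overall_avg': 3.5,
    'overall_count': 6, 'overall_nan_count': 0,
    'overall_neg_inf_count': 0, 'overall_pos_inf_count': 0,
    'overall_zero_count': 0.0, 'overall_neg_zero_count': 0.0,
    'overall_pos_zero_count': 6.0,
}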

+ 368 - 93  mindinsight/debugger/stream_handler/watchpoint_handler.py

@@ -13,25 +13,37 @@
# limitations under the License.
# ============================================================================
"""Define the watchpoint stream handler."""
import numpy as np

from mindinsight.conditionmgr.condition import ValueTypeEnum
from mindinsight.debugger.common.exceptions.exceptions import DebuggerParamValueError, \
DebuggerParamTypeError
from mindinsight.debugger.common.log import logger as log
from mindinsight.debugger.common.log import LOGGER as log
from mindinsight.debugger.common.utils import is_scope_type
from mindinsight.debugger.proto.debug_grpc_pb2 import SetCMD
from mindinsight.debugger.stream_cache.watchpoint import Watchpoint, WatchpointHit, \
WATCHPOINT_CONDITION_MAPPING
WatchNodeTree
from mindinsight.debugger.stream_handler.base_handler import StreamHandlerBase


class WatchpointHandler(StreamHandlerBase):
"""watchpoint Handler."""
"""Watchpoint Handler."""

def __init__(self):
self._watchpoints = {}
# list of ids of new created watchpoints
self._created_watchpoints = []
# list of SetCMD of watchpoints to be deleted
self._deleted_watchpoints = []
# dict of <id, SetCMD> of watchpoint to be updated
self._updated_watchpoints = {}
# the collection of watched node full names, which have been sent to MindSpore
self._all_watched_node_full_names = set()
# the collection of new watched node full names, which have not been sent to MindSpore
self._new_watched_node_full_names = set()
# record the temp stored nodes in MS, which could be set as watch nodes for recheck on GPU;
# should be cleaned at the beginning of each step
self._temp_cached_node_full_names = set()
self._latest_id = 0
self._cache_set_cmd = {}

def put(self, value):
"""
@@ -42,34 +54,50 @@ class WatchpointHandler(StreamHandlerBase):
"""
new_id = value.watchpoint_id
self._watchpoints[new_id] = value
self._created_watchpoints.append(new_id)
self._updated_watchpoints[new_id] = value
self._latest_id = new_id
log.debug("Put watchpoint %d into cache.", new_id)

def sync_set_cmd(self):
def clean_temp_cached_names(self):
"""Clean temp cached node."""
self._temp_cached_node_full_names.clear()

def add_temp_cached_name(self, node_full_name):
"""Add temp stored node in cache."""
if node_full_name:
self._temp_cached_node_full_names.add(node_full_name)

def sync_set_cmd(self, set_cmds):
"""Clean temp watchpoints."""
self._new_watched_node_full_names = set()
self._created_watchpoints = []
self._deleted_watchpoints = []
self._updated_watchpoints = {}
for set_cmd in set_cmds:
self._cache_set_cmd[set_cmd.id] = set_cmd

def clean_cache_set_cmd(self, set_cmd):
"""Clean cache set command."""
self._cache_set_cmd.pop(set_cmd.id, None)

def get_watchpoint_by_id(self, watchpoint_id):
"""Get watchpoint by watchpoint id."""
watchpoint = self._watchpoints.get(watchpoint_id)
if not watchpoint:
log.error("Invalid watchpoint id %d", watchpoint_id)
raise DebuggerParamValueError("Invalid watchpoint id {}".format(watchpoint_id))
res = self.get(watchpoint_id)
watchpoint = res.get('watch_points')[0]

return watchpoint

def get(self, filter_condition=False):
def get(self, filter_condition=None):
"""
Get the watchpoints.

Args:
filter_condition (bool): If True, get all watchpoints without nodes. If False,
get updated watchpoints in SetCMD proto format. Default: False.
filter_condition (Union[None, int]): The filter conditions. Get watchpoint by
id. If None, return all watchpoint. Default: None.

Returns:
dict, the watchpoints.
dict, the watchpoint list.
"""
reply = []
if not filter_condition:
@@ -78,17 +106,85 @@ class WatchpointHandler(StreamHandlerBase):
watchpoint_info = watchpoint.get_watch_condition_info()
reply.append(watchpoint_info)
else:
# get updated watchpoint list
for _, watchpoint in self._updated_watchpoints.items():
set_cmd = watchpoint.get_set_cmd()
reply.append(set_cmd)
reply.extend(self._deleted_watchpoints)
self.validate_watchpoint_id(filter_condition)
reply = [self._watchpoints.get(filter_condition)]

log.debug("get the watch points with filter_condition:%s", filter_condition)

return {'watch_points': reply}

def set_watch_nodes(self, graph, graph_stream, watch_point_id):
def get_pending_commands(self, graph_stream):
"""
Get all watchpoint in SetCMD proto format.

Args:
graph_stream (GraphHandler): Graph handler.

Returns:
list[SetCMD], updated watchpoint to be sent to MindSpore.
"""
res = []
new_watched_nodes = set()
self._all_watched_node_full_names.clear()
for _, watchpoint in self._updated_watchpoints.items():
# construct set command with leaf nodes
watch_nodes = watchpoint.get_watch_nodes()
leaf_watch_nodes = self._expand_to_leaf_nodes(graph_stream, watch_nodes)
res.append(watchpoint.get_pending_cmd(leaf_watch_nodes))
# update all watched node names
watch_node_names = [watch_node.full_name for watch_node in [*watch_nodes, *leaf_watch_nodes]]
new_watched_nodes.update(watch_node_names)
res.extend(self._deleted_watchpoints)
for _, set_cmd in self._cache_set_cmd.items():
res.append(set_cmd)
self._all_watched_node_full_names = new_watched_nodes
return res

@staticmethod
def _expand_to_leaf_nodes(graph_stream, watch_nodes):
"""
Get all leaf node basic info according to watch nodes.

Args:
graph_stream (GraphHandler): Graph handler.
watch_nodes (list[NodeBasicInfo]): The list of watch node basic infos.

Returns:
list[NodeBasicInfo], expanded leaf basic node infos.
"""
leaf_watch_nodes = []
for node in watch_nodes:
if is_scope_type(node.type):
pure_node_name = None
if len(node.name.split('/')) > 1:
graph_name, pure_node_name = node.name.split('/', 1)
else:
graph_name = node.name
search_node_infos = graph_stream.get_node_basic_info_by_scope(pure_node_name, graph_name=graph_name)
leaf_watch_nodes.extend(search_node_infos)
else:
leaf_watch_nodes.append(node)
return leaf_watch_nodes
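A reduced sketch of that expansion rule; NodeBasicInfo mirrors the namedtuple used in graph_handler, while list_leaf_nodes and the scope_types tuple stand in for the graph lookup and are assumptions:

from collections import namedtuple

NodeBasicInfo = namedtuple('NodeBasicInfo', ['name', 'full_name', 'type'])

def expand_to_leaf_nodes_sketch(watch_nodes, list_leaf_nodes,
                                scope_types=('name_scope', 'aggregation_scope')):
    # Scope-type nodes are replaced by the leaf nodes found under them;
    # leaf nodes pass through unchanged.
    leaf_watch_nodes = []
    for node in watch_nodes:
        if node.type in scope_types:
            # Split off the leading graph scope before searching the graph.
            graph_name, _, pure_name = node.name.partition('/')
            leaf_watch_nodes.extend(list_leaf_nodes(graph_name, pure_name or None))
        else:
            leaf_watch_nodes.append(node)
    return leaf_watch_nodes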

def is_recheckable(self, backend=None):
"""
Check if current status is able to recheck.

Args:
backend (str): The backend info. 'Ascend' or 'GPU'. Default: None.

Returns:
bool, whether recheck is enabled.
"""
enable_recheck = bool(self._updated_watchpoints or self._deleted_watchpoints)
if backend == 'GPU' and enable_recheck:
# on GPU, disable recheck if there are newly watched nodes whose tensors
# have not been stored on MindSpore
diff_set = self._new_watched_node_full_names - self._all_watched_node_full_names
enable_recheck = not diff_set or diff_set.issubset(self._temp_cached_node_full_names)
return enable_recheck
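The GPU branch reduces to a set comparison; a standalone sketch (function and argument names are hypothetical):

def gpu_recheckable(new_watched, all_watched, temp_cached, has_changes=True):
    # Sketch of the GPU rule in is_recheckable(): newly watched nodes
    # whose tensors MindSpore has not stored yet block the recheck,
    # unless they are in the per-step temporary cache.
    if not has_changes:
        return False
    diff_set = set(new_watched) - set(all_watched)
    return not diff_set or diff_set.issubset(set(temp_cached))

# A newly watched node that is only temp-cached still allows recheck:
assert gpu_recheckable({'Default/conv1'}, set(), {'Default/conv1'})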

def set_watch_nodes(self, graph, graph_stream, watch_point_id, graph_name=None):
"""
Set watch nodes for graph.

@@ -96,54 +192,88 @@ class WatchpointHandler(StreamHandlerBase):
graph (dict): The graph with list of nodes.
graph_stream (GraphHandler): The graph handler.
watch_point_id (int): The id of watchpoint.
graph_name (str): The graph name.
"""
if not (watch_point_id and graph):
return
log.debug("add watch flags")
watchpoint = self._watchpoints.get(watch_point_id)
self._set_watch_status_recursively(graph, graph_stream, watchpoint)
self._set_watch_status_recursively(graph, graph_stream, watchpoint, graph_name)

def _set_watch_status_recursively(self, graph, graph_stream, watchpoint):
def _set_watch_status_recursively(self, graph, graph_stream, watchpoint, graph_name=None):
"""Set watch status to graph."""
if not isinstance(graph, dict):
log.warning("The graph is not dict.")
return
if graph.get('children'):
self._set_watch_status_recursively(graph.get('children'), graph_stream, watchpoint)
self._set_watch_status_recursively(
graph.get('children'), graph_stream, watchpoint, graph_name)

for node in graph.get('nodes', []):
if not isinstance(node, dict):
log.warning("The node is not dict.")
return
if graph.get('nodes'):
_ = self._set_watch_state_for_nodes(graph['nodes'], graph_stream, watchpoint, graph_name)

def _set_watch_state_for_nodes(self, nodes, graph_stream, watchpoint, graph_name):
"""
Set watch state for nodes.

Args:
nodes (list[Node]): List of node info.

Returns:
int, the number of all watched nodes.
"""
all_watched_num = 0
for node in nodes:
node_name = node.get('name')
if not node_name:
continue
full_name = graph_stream.get_full_name(node_name)
flag = watchpoint.get_node_status(node_name, node.get('type'), full_name)
node['watched'] = flag
# search result could have `nodes` in nodes object
if node.get('nodes'):
self._set_watch_status_recursively(node, graph_stream, watchpoint)
flag = self._set_watch_state_for_nodes(node.get('nodes'), graph_stream, watchpoint, graph_name)
else:
full_name = graph_stream.get_full_name(node_name, graph_name)
new_node_name = node_name if graph_name is None else '/'.join([graph_name, node_name])
flag = watchpoint.get_node_status(new_node_name, node.get('type'), full_name)
node['watched'] = flag
if flag == WatchNodeTree.TOTAL_WATCH:
all_watched_num += 1

# calculate the state of current node.
if not all_watched_num:
state = WatchNodeTree.NOT_WATCH
elif all_watched_num == len(nodes):
state = WatchNodeTree.TOTAL_WATCH
else:
state = WatchNodeTree.PARTIAL_WATCH

return state
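The parent state is derived from the children's states by a three-way rule; a sketch (the numeric constants mirror WatchNodeTree but are assumptions here):

NOT_WATCH, PARTIAL_WATCH, TOTAL_WATCH = 0, 1, 2  # assumed WatchNodeTree values

def aggregate_watch_state(child_states):
    # A scope counts as fully watched only if every child is fully watched.
    watched = sum(1 for state in child_states if state == TOTAL_WATCH)
    if not watched:
        return NOT_WATCH
    if watched == len(child_states):
        return TOTAL_WATCH
    return PARTIAL_WATCH

assert aggregate_watch_state([TOTAL_WATCH, NOT_WATCH]) == PARTIAL_WATCH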

def create_watchpoint(self, watch_condition, watch_nodes=None, watch_point_id=None):
def create_watchpoint(self, condition_mgr, watch_condition, watch_nodes=None, watch_point_id=None):
"""
Create watchpoint.
Args:
condition_mgr (ConditionMgr): Instance of ConditionMgr.
watch_condition (dict): The watch condition.

- condition (str): Accept `INF` or `NAN`.

- param (list[float]): Not defined yet.
"condition": {
id: "tensor_too_large",
"params": [
{
"name": "abs_mean_gt",
"disable": false,
"value": 1.1
}
]
}
- id (str): Id of condition.
- param (list[dict]): The list of param for this condition.
watch_nodes (list[NodeBasicInfo]): The list of node basic info.
watch_point_id (int): The id of watchpoint.

Returns:
int, the new id of watchpoint.
"""
validate_watch_condition(watch_condition)
validate_watch_condition(condition_mgr, watch_condition)
watch_condition = set_default_param(condition_mgr, watch_condition)
new_id = self._latest_id + 1
watchpoint = Watchpoint(new_id, watch_condition)
if watch_nodes:
watchpoint.add_nodes(watch_nodes)
self._add_watch_node_in_cache(watch_nodes)
elif watch_point_id:
self.validate_watchpoint_id(watch_point_id)
watchpoint.copy_nodes_from(self._watchpoints.get(watch_point_id))
@@ -157,34 +287,51 @@ class WatchpointHandler(StreamHandlerBase):

Args:
watch_point_id (int): The id of watchpoint.
watch_nodes (list[str]): The list of node names.
watch_nodes (list[NodeBasicInfo]): The list of node basic info.
watched (bool): The update operator on nodes. If False, remove nodes from watch nodes.
If True, add nodes to watch nodes. Default: False.

Returns:
dict, empty response.
"""
self.validate_watchpoint_id(watch_point_id)
watchpoint = self._watchpoints.get(watch_point_id)
if watched:
watchpoint.add_nodes(watch_nodes)
self._add_watch_node_in_cache(watch_nodes)
else:
watchpoint.remove_nodes(watch_nodes)
self._remove_watch_node_from_cache(watch_nodes)
self._updated_watchpoints[watch_point_id] = watchpoint
log.debug("Update watchpoint %d in cache.", watch_point_id)

def delete_watchpoint(self, watch_point_id):
def delete_watchpoint(self, watch_point_id=None):
"""
Delete watchpoint.

Args:
watch_point_id (int): The id of watchpoint.
watch_point_id (Union[None, int]): The id of watchpoint.
If None, delete all watchpoints. Default: None.
"""
if watch_point_id is None:
watch_point_ids = [sub_id for sub_id, _ in self._watchpoints.items()]
else:
self.validate_watchpoint_id(watch_point_id)
watch_point_ids = [watch_point_id]
for single_id in watch_point_ids:
self._delete_single_watchpoint(single_id)

def _delete_single_watchpoint(self, watch_point_id):
"""
Delete single watchpoint.

Args:
watch_point_id (int): The id of watchpoint.
"""
self.validate_watchpoint_id(watch_point_id)
self._watchpoints.pop(watch_point_id)
# if the watchpoint has not been created by MindSpore, clean the relative cache directly
if watch_point_id in self._created_watchpoints:
self._created_watchpoints.remove(watch_point_id)
self._updated_watchpoints.pop(watch_point_id)
log.debug("Cancel create watchpoint %d in cache.", watch_point_id)
return
set_cmd = SetCMD()
set_cmd.id = watch_point_id
set_cmd.delete = True
@@ -200,11 +347,33 @@ class WatchpointHandler(StreamHandlerBase):
log.error("Invalid watchpoint id: %d.", watch_point_id)
raise DebuggerParamValueError("Invalid watchpoint id: {}".format(watch_point_id))

def _add_watch_node_in_cache(self, watch_nodes):
"""
Add watch nodes in cache.

Args:
watch_nodes (list[NodeBasicInfo]): The list of node basic info.
"""
node_full_names = [node.full_name for node in watch_nodes]
self._new_watched_node_full_names.update(node_full_names)

def _remove_watch_node_from_cache(self, watch_nodes):
"""
Remove watch nodes from cache.

Args:
watch_nodes (list[NodeBasicInfo]): The list of node basic info.
"""
for node in watch_nodes:
if node.full_name in self._new_watched_node_full_names:
self._new_watched_node_full_names.remove(node.full_name)


class WatchpointHitHandler(StreamHandlerBase):
"""Watchpoint hit handler."""

def __init__(self):
# dict of <ui node_name, dict of <slot, list[WatchpointHit]>>
self._hits = {}

@property
@@ -224,20 +393,41 @@ class WatchpointHitHandler(StreamHandlerBase):
- watchpoint (Watchpoint): The Watchpoint that a node hit.

- node_name (str): The UI node name.

- graph_name (str): The graph name.
"""
watchpoint_hit = WatchpointHit(
tensor_proto=value.get('tensor_proto'),
watchpoint=value.get('watchpoint'),
node_name=value.get('node_name')
node_name=value.get('node_name'),
graph_name=value.get('graph_name')
)
# get all hit watchpoints according to node name and tensor slot
watchpoint_hits = self._get_watchpoints_by_tensor_name(watchpoint_hit.node_name,
watchpoint_hit.slot)
if watchpoint_hit not in watchpoint_hits:
watchpoint_hits.append(watchpoint_hit)

def _get_watchpoints_by_tensor_name(self, node_name, slot):
"""
Get hit tensors according to ui node name and slot.

Args:
node_name (str): The node name.
slot (str): The tensor slot.

Returns:
list, list of watchpoints.
"""
hit_node = self._hits.get(node_name)
if hit_node is None:
hit_node = {}
self._hits[node_name] = hit_node
hit_tensors = hit_node.get(slot)
if hit_tensors is None:
hit_tensors = []
hit_node[slot] = hit_tensors
return hit_tensors
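The same two-level bookkeeping (node name, then slot, then hit list) can be condensed with setdefault; a sketch with a hypothetical helper and placeholder hit values:

def record_hit(hits, node_name, slot, watchpoint_hit):
    # Sketch of the _hits cache: {node_name: {slot: [WatchpointHit, ...]}}.
    hit_tensors = hits.setdefault(node_name, {}).setdefault(slot, [])
    if watchpoint_hit not in hit_tensors:
        hit_tensors.append(watchpoint_hit)
    return hit_tensors

hits = {}
record_hit(hits, 'Default/conv1-Conv2d', '0', 'hit-1')
record_hit(hits, 'Default/conv1-Conv2d', '0', 'hit-1')  # de-duplicated
assert hits == {'Default/conv1-Conv2d': {'0': ['hit-1']}}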

def get(self, filter_condition=None):
"""
@@ -263,34 +453,55 @@ class WatchpointHitHandler(StreamHandlerBase):
"""Return the list of watchpoint hits."""
watch_point_hits = []
for node_name, watchpoint_hits in self._hits.items():
tensors = []
graph_name = None
for slot, tensor_hits in watchpoint_hits.items():
if graph_name is None:
graph_name = tensor_hits[0].graph_name
tensor_info = self._get_tensor_hit_info(slot, tensor_hits)
tensors.append(tensor_info)
watch_point_hits.append({
'node_name': node_name,
'tensors': tensors,
'graph_name': graph_name
})

return {'watch_point_hits': watch_point_hits}

@staticmethod
def _get_tensor_hit_info(slot, tensor_hits):
"""
Get watchpoint hit info of specified tensor.

Args:
slot (str): Slot id.
tensor_hits (list): A list of watchpoint hit objects that the tensor hit.

Returns:
dict, tensor hit info.
"""
res = {}
watch_points = [tensor_hit.watchpoint for tensor_hit in tensor_hits]
if watch_points:
res = {
'slot': slot,
'watch_points': watch_points
}
return res

def _is_tensor_hit(self, tensor_name):
"""
Check if the tensor is record in hit cache.

Args:
tensor_name (str): The name of full tensor name.
tensor_name (str): The tensor name on UI.

Returns:
bool, if the tensor is hit.
"""
node_name = tensor_name.split(':')[0]
watchpoint_hits = self.get(node_name)
if watchpoint_hits is None:
return False

for watchpoint_hit in watchpoint_hits:
if tensor_name == watchpoint_hit.tensor_name:
return True

return False
node_name, slot = tensor_name.rsplit(':', 1)
watchpoint_hits = self._hits.get(node_name, {}).get(slot)
return bool(watchpoint_hits)

def update_tensor_history(self, tensor_history):
"""
@@ -308,45 +519,109 @@ class WatchpointHitHandler(StreamHandlerBase):
hit_flag = self._is_tensor_hit(tensor_name)
tensor_info['is_hit'] = hit_flag

def get_tensor_hit_infos(self, tensor_name):
"""
Get all hit information of a tensor.

Args:
tensor_name (str): Tensor name shown on UI.

Returns:
dict, tensor hit info.
"""
tensor_hit_info = {}
if self._is_tensor_hit(tensor_name):
node_name, slot = tensor_name.rsplit(':', 1)
tensor_hits = self._get_watchpoints_by_tensor_name(node_name, slot)
tensor_hit_info = self._get_tensor_hit_info(slot, tensor_hits)
return tensor_hit_info


def validate_watch_condition(condition_mgr, watch_condition):
"""Validate watch condition."""
if not isinstance(watch_condition, dict):
log.error("<watch_condition> should be dict. %s received.", watch_condition)
raise DebuggerParamTypeError("<watch_condition> should be dict.")
# validate condition
condition = watch_condition.get('condition')
if condition not in WATCHPOINT_CONDITION_MAPPING.keys():
log.error("Invalid watch condition. Acceptable values are <%s>.",
str(WATCHPOINT_CONDITION_MAPPING.keys()))
# validate condition_id
condition_id = watch_condition.get('id')
if condition_id not in condition_mgr.conditions.keys():
log.error("Invalid watch condition. Acceptable values are <%s>. %s received.",
str(condition_mgr.conditions.keys()), condition_id)
raise DebuggerParamValueError("Invalid watch condition value.")
# validate param
validate_watch_condition_params(watch_condition)
validate_watch_condition_params(condition_mgr, watch_condition)


def validate_watch_condition_params(watch_condition):
def validate_watch_condition_params(condition_mgr, watch_condition):
"""
Validate watch condition parameters.

Args:
condition_mgr (ConditionMgr): Instance of ConditionMgr.
watch_condition (dict): Watch condition.

- condition (str): Condition type. Should be in WATCHPOINT_CONDITION_MAPPING.
- id (str): Condition id. Should be in WATCHPOINT_CONDITION_MAPPING.

- param (list): Condition value. Should be given for comparison condition. The value will
be translated to np.float32.
- param (list): Condition value. Should be given for comparison condition. The value
will be translated to np.float32.
"""
condition = watch_condition.get('condition')
param = watch_condition.get('param')
if condition in ['NAN', 'INF', 'OVERFLOW']:
if param:
log.error("No param is expected for %s condition.", condition)
condition_id = watch_condition.get('id')
params = watch_condition.get('params')
condition = condition_mgr.get_condition(condition_id)
if condition_id in condition_mgr.get_no_param_condition():
if params:
log.error("No param is expected for %s condition", condition_id)
raise DebuggerParamValueError("No param is expected.")
else:
if not isinstance(param, (float, int)):
log.error("Number param should be given for condition <%s>.",
condition)
return

for param in params:
if param.get("name") not in condition.names:
log.error("Invalid name of parameter for condition: %s, available values: %s",
condition_id, condition.names)
raise DebuggerParamValueError("Invalid name of parameter.")

condition_param = condition.get_parameter_definition(param.get("name"))
if condition_param.type.name in (ValueTypeEnum.FLOAT64.name, ValueTypeEnum.INT64.name) \
and not isinstance(param.get("value"), (float, int)):
log.error("Number param should be given for condition: %s", condition_id)
raise DebuggerParamValueError("Number param should be given.")
if np.isinf(np.float32(param)):
log.error("Condition param should be float32.")
raise DebuggerParamValueError("The value of condition param should be within float32.")

if condition_param.type.name == ValueTypeEnum.BOOL.name \
and not isinstance(param.get("value"), bool):
log.error("Bool param should be given for condition: %s", condition_id)
raise DebuggerParamValueError("Bool param should be given.")


def set_default_param(condition_mgr, watch_condition):
"""
Set default param.
Args:
condition_mgr (ConditionMgr): Instance of ConditionMgr.
watch_condition (dict): The watch condition.
"condition": {
id: "tensor_too_large",
"params": [
{
"name": "abs_mean_gt",
"disable": false,
"value": 1.1
}
]
}
- id (str): Id of condition.
- param (list[dict]): The list of param for this condition.

Returns:
dict, the new watch_condition.
"""
condition_id = watch_condition.get('id')
condition = condition_mgr.get_condition(condition_id)
for param in condition.parameters:
if not param.visible_on_ui and not param.support_disable:
watch_condition["params"].append({
"name": param.name,
"disable": False,
"value": param.default_value
})
watch_condition["abbr"] = condition.abbr
return watch_condition
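In other words, parameters hidden on the UI that cannot be disabled are appended with their defaults, and the condition's abbreviation is attached; a dict-based sketch (the condition definition here is hypothetical, standing in for a Condition object):

def set_default_param_sketch(condition, watch_condition):
    # condition is a plain-dict stand-in for a Condition object.
    for param in condition['parameters']:
        if not param['visible_on_ui'] and not param['support_disable']:
            watch_condition['params'].append({
                'name': param['name'],
                'disable': False,
                'value': param['default_value'],
            })
    watch_condition['abbr'] = condition['abbr']
    return watch_condition

condition = {'abbr': 'TL', 'parameters': [
    {'name': 'hidden_thr', 'visible_on_ui': False,
     'support_disable': False, 'default_value': 0.5}]}
result = set_default_param_sketch(condition, {'id': 'tensor_too_large', 'params': []})
assert result['params'][0]['value'] == 0.5 and result['abbr'] == 'TL'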

+ 15 - 0  mindinsight/debugger/stream_operator/__init__.py

@@ -0,0 +1,15 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""This package contains operators using multiple streams to deal with specific task."""

+ 120 - 0  mindinsight/debugger/stream_operator/tensor_detail_info.py

@@ -0,0 +1,120 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""This module is aimed to provide with tensor detail info."""
from mindinsight.debugger.common.exceptions.exceptions import DebuggerParamValueError
from mindinsight.debugger.common.log import LOGGER as log
from mindinsight.debugger.common.utils import Streams


class TensorDetailInfo:
"""Manage tensor detail information."""

def __init__(self, cache):
self._tensor_stream = cache.get_stream_handler(Streams.TENSOR)
self._graph_stream = cache.get_stream_handler(Streams.GRAPH)
self._hit_stream = cache.get_stream_handler(Streams.WATCHPOINT_HIT)

def validate_tensor_name(self, tensor_name, graph_name):
"""
Validate the tensor name and check that its node exists in the graph.

Args:
tensor_name (str): The tensor name on UI.
graph_name (str): The graph name.
"""
# validate tensor name format
if not isinstance(tensor_name, str) or ':' not in tensor_name:
log.error("Invalid tensor name. Received: %s", tensor_name)
raise DebuggerParamValueError("Invalid tensor name.")
node_name, _ = tensor_name.rsplit(':', 1)
# check if the node name is in graph
self._graph_stream.validate_node_name(node_name=node_name, graph_name=graph_name)

def get_tensor_graph(self, tensor_name, graph_name):
"""
Get the graph related to specific tensor.

Args:
tensor_name (str): The name of tensor. Format like {node_name}:{slot}.
graph_name (str): The graph name.

Returns:
dict, tensor graph, format is {'nodes': [Node object]}.
The Node object = {
'graph_name': <graph_name>,
'name': <node name>,
'input': {<node name>: <Edge object>},
'output': {<node name>: <Edge object>},
'slots': [<Slot object>].
}
Edge object = {
'data_type': <data type>,
'edge_type': <edge type>,
'independent_layout': bool,
'shape': list[<dim>],
'slot_mapping': list[pair<slot, slot>],
}.
"""
self.validate_tensor_name(tensor_name=tensor_name, graph_name=graph_name)
graph = self._graph_stream.get_tensor_graph(tensor_name, graph_name)
# add watchpoint hits info and statistics info for each tensor in tensor graph.
nodes = graph.get('graph', {}).get('nodes', [])
for node in nodes:
node['graph_name'] = graph_name
for slot_info in node.get('slots', []):
self._add_watchpoint_hit_info(slot_info, node)
self._add_statistic_info(slot_info, node)
return graph

def _add_watchpoint_hit_info(self, slot_info, node):
"""
Get the watchpoint that the tensor hit.

Args:
slot_info (dict): Slot object.
node (dict): Node object.
"""
tensor_name = ':'.join([node.get('name'), slot_info.get('slot')])
slot_info.update(self._hit_stream.get_tensor_hit_infos(tensor_name))

def _add_statistic_info(self, slot_info, node):
"""
Add statistics info for the tensor.

Args:
slot_info (dict): Slot object.
node (dict): Node object.
"""
tensor_name = ':'.join([node.get('full_name'), slot_info.get('slot')])
node_type = node.get('type')
slot_info['statistics'] = self._tensor_stream.get_tensor_statistics(tensor_name, node_type)

def get_tensor_watch_points(self, tensor_name, graph_name):
"""
Get all watchpoints that the tensor hit.

Args:
tensor_name (str): Tensor name from UI.
graph_name (str): The graph name.

Returns:
list, watchpoint hit infos.
"""
# validate tensor_name
self.validate_tensor_name(tensor_name=tensor_name, graph_name=graph_name)
# get watchpoint info that the tensor hit
tensor_hit_info = self._hit_stream.get_tensor_hit_infos(tensor_name)
watch_points = tensor_hit_info.get('watch_points', [])
return watch_points

+ 61 - 6  mindinsight/utils/tensor.py

@@ -32,16 +32,23 @@ class Statistics:
avg_value (float): avg value of tensor data.
count (int): total count of tensor data.
nan_count (int): count of NAN.
neg_zero_count (int): count of negative zero.
pos_zero_count (int): count of positive zero.
zero_count (int): count of zero.
neg_inf_count (int): count of negative INF.
pos_inf_count (int): count of positive INF.
"""

def __init__(self, max_value=0, min_value=0, avg_value=0,
count=0, nan_count=0, neg_inf_count=0, pos_inf_count=0):
def __init__(self, max_value=0, min_value=0, avg_value=0, count=0,
neg_zero_count=0, pos_zero_count=0, zero_count=0,
nan_count=0, neg_inf_count=0, pos_inf_count=0):
self._max = max_value
self._min = min_value
self._avg = avg_value
self._count = count
self._neg_zero_count = neg_zero_count
self._pos_zero_count = pos_zero_count
self._zero_count = zero_count
self._nan_count = nan_count
self._neg_inf_count = neg_inf_count
self._pos_inf_count = pos_inf_count
@@ -81,6 +88,21 @@ class Statistics:
"""Get count of positive INF."""
return self._pos_inf_count

@property
def neg_zero_count(self):
"""Get count of negative zero."""
return self._neg_zero_count

@property
def pos_zero_count(self):
"""Get count of positive zero."""
return self._pos_zero_count

@property
def zero_count(self):
"""Get count of zero."""
return self._zero_count

class TensorComparison:
"""TensorComparison class.

@@ -204,7 +226,7 @@ class TensorUtils:
tensors (numpy.ndarray): A numpy.ndarray of tensor data.

Returns:
an instance of Statistics.
Statistics, an instance of Statistics.
"""
ma_value = np.ma.masked_invalid(tensors)
total, valid = tensors.size, ma_value.count()
@@ -240,10 +262,19 @@ class TensorUtils:
tensor_min = ma_value.min()
tensor_max = ma_value.max()
tensor_sum = ma_value.sum(dtype=np.float64)
with np.errstate(invalid='ignore'):
neg_zero_count = np.sum(ma_value < 0)
with np.errstate(invalid='ignore'):
pos_zero_count = np.sum(ma_value > 0)
with np.errstate(invalid='ignore'):
zero_count = np.sum(ma_value == 0)
statistics = Statistics(max_value=tensor_max,
min_value=tensor_min,
avg_value=tensor_sum / valid,
count=total,
neg_zero_count=neg_zero_count,
pos_zero_count=pos_zero_count,
zero_count=zero_count,
nan_count=nan_count,
neg_inf_count=neg_inf_count,
pos_inf_count=pos_inf_count)
@@ -269,11 +300,35 @@ class TensorUtils:
"count": stats.count,
"nan_count": stats.nan_count,
"neg_inf_count": stats.neg_inf_count,
"pos_inf_count": stats.pos_inf_count,
"pos_inf_count": stats.pos_inf_count}
overall_statistics = TensorUtils.get_overall_statistic_dict(overall_stats)
statistics.update(overall_statistics)
return statistics

@staticmethod
def get_overall_statistic_dict(overall_stats):
"""
Get overall statistics dict according to statistics value.

Args:
overall_stats (Statistics): An instance of Statistics for whole tensor.

Returns:
dict, overall statistics.
"""
res = {
"overall_max": float(overall_stats.max),
"overall_min": float(overall_stats.min)
"overall_min": float(overall_stats.min),
"overall_avg": float(overall_stats.avg),
"overall_count": overall_stats.count,
"overall_nan_count": overall_stats.nan_count,
"overall_neg_inf_count": overall_stats.neg_inf_count,
"overall_pos_inf_count": overall_stats.pos_inf_count,
"overall_zero_count": float(overall_stats.zero_count),
"overall_neg_zero_count": float(overall_stats.neg_zero_count),
"overall_pos_zero_count": float(overall_stats.pos_zero_count)
}
return statistics
return res
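The new sign/zero counts come from masked comparisons over the valid elements only; a self-contained check of that behavior (note the counts cover all negative/positive values, with NaN and Inf masked out before summing):

import numpy as np

values = np.array([-2.0, 0.0, 1.0, np.nan, np.inf])
ma_value = np.ma.masked_invalid(values)  # masks the NaN and the Inf
with np.errstate(invalid='ignore'):
    neg_count = int(np.sum(ma_value < 0))    # 1  (-2.0)
    pos_count = int(np.sum(ma_value > 0))    # 1  (1.0; the Inf is masked)
    zero_count = int(np.sum(ma_value == 0))  # 1  (0.0)
assert (neg_count, pos_count, zero_count) == (1, 1, 1)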

@staticmethod
def calc_diff_between_two_tensor(first_tensor, second_tensor, tolerance):


+ 8 - 2  tests/st/func/debugger/conftest.py

@@ -51,8 +51,9 @@ def init_graph_handler():
@pytest.fixture(scope='session')
def app_client():
"""This fixture is flask server."""
packages = ["mindinsight.backend.debugger"]
packages = ["mindinsight.backend.debugger", "mindinsight.backend.conditionmgr"]
settings.ENABLE_DEBUGGER = True

mock_obj = Mock(return_value=packages)
tools.find_app_package = mock_obj

@@ -60,5 +61,10 @@ def app_client():
from mindinsight.backend.debugger.debugger_api import BACKEND_SERVER
APP.response_class = Response
client = APP.test_client()
yield client
original_val = settings.ENABLE_RECOMMENDED_WATCHPOINTS
settings.ENABLE_RECOMMENDED_WATCHPOINTS = False
try:
yield client
finally:
settings.ENABLE_RECOMMENDED_WATCHPOINTS = original_val
BACKEND_SERVER.stop()

+ 1 - 1  tests/st/func/debugger/expect_results/restful_results/before_train_begin.json

@@ -1 +1 @@
{"metadata": {"state": "pending", "step": 0, "device_name": "", "ip": "", "node_name": "", "backend": ""}}
{"metadata": {"state": "pending", "step": 0, "device_name": "", "ip": "", "node_name": "", "backend": "", "enable_recheck": false, "graph_name": ""}}

+ 75 - 56  tests/st/func/debugger/expect_results/restful_results/compare_tensors.json

@@ -1,58 +1,77 @@
{
"tensor_value": {
"full_name": "Default/args0:0",
"step": 3,
"dtype": "DT_FLOAT32",
"shape": [
2,
3
],
"diff": [
[
[
1.0,
1.0,
0.0
],
[
2.0,
2.0,
0.0
],
[
3.0,
3.0,
0.0
]
],
[
[
4.0,
4.0,
0.0
],
[
5.0,
5.0,
0.0
],
[
6.0,
6.0,
0.0
]
]
],
"statistics": {
"max": 0.0,
"min": 0.0,
"avg": 0.0,
"count": 6,
"nan_count": 0,
"neg_inf_count": 0,
"pos_inf_count": 0,
"overall_max": 0.0,
"overall_min": 0.0
}
}
"tensor_value": {
"full_name": "Default/args0:0",
"step": 3,
"dtype": "DT_FLOAT32",
"shape": [2, 3],
"diff": [
[
[1.0, 1.0, 0.0],
[2.0, 2.0, 0.0],
[3.0, 3.0, 0.0]
],
[
[4.0, 4.0, 0.0],
[5.0, 5.0, 0.0],
[6.0, 6.0, 0.0]
]
],
"curr_step_statistics": {
"max": 6.0,
"min": 1.0,
"avg": 3.5,
"count": 6,
"nan_count": 0,
"neg_inf_count": 0,
"pos_inf_count": 0,
"overall_max": 6.0,
"overall_min": 1.0,
"overall_avg": 3.5,
"overall_count": 6,
"overall_nan_count": 0,
"overall_neg_inf_count": 0,
"overall_pos_inf_count": 0,
"overall_zero_count": 0.0,
"overall_neg_zero_count": 0.0,
"overall_pos_zero_count": 6.0
},
"prev_step_statistics": {
"max": 6.0,
"min": 1.0,
"avg": 3.5,
"count": 6,
"nan_count": 0,
"neg_inf_count": 0,
"pos_inf_count": 0,
"overall_max": 6.0,
"overall_min": 1.0,
"overall_avg": 3.5,
"overall_count": 6,
"overall_nan_count": 0,
"overall_neg_inf_count": 0,
"overall_pos_inf_count": 0,
"overall_zero_count": 0.0,
"overall_neg_zero_count": 0.0,
"overall_pos_zero_count": 6.0
},
"statistics": {
"max": 0.0,
"min": 0.0,
"avg": 0.0,
"count": 6,
"nan_count": 0,
"neg_inf_count": 0,
"pos_inf_count": 0,
"overall_max": 0.0,
"overall_min": 0.0,
"overall_avg": 0.0,
"overall_count": 6,
"overall_nan_count": 0,
"overall_neg_inf_count": 0,
"overall_pos_inf_count": 0,
"overall_zero_count": 6.0,
"overall_neg_zero_count": 0.0,
"overall_pos_zero_count": 0.0
}
}
}

+ 1 - 1  tests/st/func/debugger/expect_results/restful_results/create_and_delete_watchpoint.json

@@ -1 +1 @@
{"watch_points": [{"id": 1, "watch_condition": {"condition": "MAX_GT", "param": 1.0}}, {"id": 2, "watch_condition": {"condition": "MAX_LT", "param": -1.0}}, {"id": 3, "watch_condition": {"condition": "MIN_GT", "param": 1e+32}}, {"id": 5, "watch_condition": {"condition": "MAX_MIN_GT", "param": 0}}, {"id": 6, "watch_condition": {"condition": "MAX_MIN_LT", "param": 0}}, {"id": 7, "watch_condition": {"condition": "MEAN_GT", "param": 0}}, {"id": 8, "watch_condition": {"condition": "MEAN_LT", "param": 0}}, {"id": 9, "watch_condition": {"condition": "INF"}}, {"id": 10, "watch_condition": {"condition": "OVERFLOW"}}]}
{"watch_points": [{"id": 1, "watch_condition": {"id": "max_gt", "params": [{"name": "param", "value": 1.0, "disable": false}], "abbr": "MAX>"}}, {"id": 2, "watch_condition": {"id": "max_lt", "params": [{"name": "param", "value": -1.0, "disable": false}], "abbr": "MAX<"}}, {"id": 3, "watch_condition": {"id": "min_gt", "params": [{"name": "param", "value": 1e+32, "disable": false}], "abbr": "MIN>"}}, {"id": 5, "watch_condition": {"id": "max_min_gt", "params": [{"name": "param", "value": 0, "disable": false}], "abbr": "MAX-MIN>"}}, {"id": 6, "watch_condition": {"id": "max_min_lt", "params": [{"name": "param", "value": 0, "disable": false}], "abbr": "MAX-Min<"}}, {"id": 7, "watch_condition": {"id": "mean_gt", "params": [{"name": "param", "value": 0, "disable": false}], "abbr": "MEAN>"}}, {"id": 8, "watch_condition": {"id": "mean_lt", "params": [{"name": "param", "value": 0, "disable": false}], "abbr": "MEAN<"}}, {"id": 9, "watch_condition": {"id": "inf", "params": [], "abbr": "INF"}}, {"id": 10, "watch_condition": {"id": "overflow", "params": [], "abbr": "OVERFLOW"}}]}

+ 1 - 0  tests/st/func/debugger/expect_results/restful_results/get_conditions_for_ascend.json

@@ -0,0 +1 @@
{"conditions": [{"id": "inf", "parameters": [], "supported_target_type": "TENSOR"}, {"id": "max_gt", "parameters": [{"name": "param", "type": "FLOAT64", "support_disable": true, "default_value": null}], "supported_target_type": "TENSOR"}, {"id": "max_lt", "parameters": [{"name": "param", "type": "FLOAT64", "support_disable": true, "default_value": null}], "supported_target_type": "TENSOR"}, {"id": "max_min_gt", "parameters": [{"name": "param", "type": "FLOAT64", "support_disable": true, "default_value": null}], "supported_target_type": "TENSOR"}, {"id": "max_min_lt", "parameters": [{"name": "param", "type": "FLOAT64", "support_disable": true, "default_value": null}], "supported_target_type": "TENSOR"}, {"id": "mean_gt", "parameters": [{"name": "param", "type": "FLOAT64", "support_disable": true, "default_value": null}], "supported_target_type": "TENSOR"}, {"id": "mean_lt", "parameters": [{"name": "param", "type": "FLOAT64", "support_disable": true, "default_value": null}], "supported_target_type": "TENSOR"}, {"id": "min_gt", "parameters": [{"name": "param", "type": "FLOAT64", "support_disable": true, "default_value": null}], "supported_target_type": "TENSOR"}, {"id": "min_lt", "parameters": [{"name": "param", "type": "FLOAT64", "support_disable": true, "default_value": null}], "supported_target_type": "TENSOR"}, {"id": "overflow", "parameters": [], "supported_target_type": "TENSOR"}]}

+ 1 - 0  tests/st/func/debugger/expect_results/restful_results/get_conditions_for_gpu.json

@@ -0,0 +1 @@
{"conditions": [{"id": "inf", "parameters": [], "supported_target_type": "TENSOR"}, {"id": "max_gt", "parameters": [{"name": "param", "type": "FLOAT64", "support_disable": true, "default_value": null}], "supported_target_type": "TENSOR"}, {"id": "max_lt", "parameters": [{"name": "param", "type": "FLOAT64", "support_disable": true, "default_value": null}], "supported_target_type": "TENSOR"}, {"id": "max_min_gt", "parameters": [{"name": "param", "type": "FLOAT64", "support_disable": true, "default_value": null}], "supported_target_type": "TENSOR"}, {"id": "max_min_lt", "parameters": [{"name": "param", "type": "FLOAT64", "support_disable": true, "default_value": null}], "supported_target_type": "TENSOR"}, {"id": "mean_gt", "parameters": [{"name": "param", "type": "FLOAT64", "support_disable": true, "default_value": null}], "supported_target_type": "TENSOR"}, {"id": "mean_lt", "parameters": [{"name": "param", "type": "FLOAT64", "support_disable": true, "default_value": null}], "supported_target_type": "TENSOR"}, {"id": "min_gt", "parameters": [{"name": "param", "type": "FLOAT64", "support_disable": true, "default_value": null}], "supported_target_type": "TENSOR"}, {"id": "min_lt", "parameters": [{"name": "param", "type": "FLOAT64", "support_disable": true, "default_value": null}], "supported_target_type": "TENSOR"}, {"id": "nan", "parameters": [], "supported_target_type": "TENSOR"}]}

+ 1 - 0  tests/st/func/debugger/expect_results/restful_results/multi_next_node.json

@@ -0,0 +1 @@
{"metadata": {"state": "waiting", "step": 1, "device_name": "0", "node_name": "Default/network-WithLossCell/_loss_fn-SoftmaxCrossEntropyWithLogits/OneHot-op0", "backend": "GPU", "enable_recheck": false, "graph_name": "graph_1"}, "graph": {"graph_names": ["graph_0", "graph_1"], "nodes": [{"name": "graph_0", "type": "name_scope", "attr": {}, "input": {}, "output": {}, "output_i": 0, "proxy_input": {}, "proxy_output": {}, "subnode_count": 2, "independent_layout": false}, {"name": "graph_1", "type": "name_scope", "attr": {}, "input": {}, "output": {}, "output_i": 0, "proxy_input": {}, "proxy_output": {}, "subnode_count": 2, "independent_layout": false}]}, "watch_points": []}

+ 672 - 0  tests/st/func/debugger/expect_results/restful_results/multi_retrieve_aggregation_scope_node.json

@@ -0,0 +1,672 @@
{
"graph": {
"nodes": [
{
"name": "graph_0/Default/optimizer-Momentum/Parameter[18]_7/fc3.bias",
"type": "Parameter",
"attr": {
"type": "DT_TENSOR[DT_FLOAT32]",
"shape": "[[10]]"
},
"input": {},
"output": {
"graph_0/Default/optimizer-Momentum/ApplyMomentum[8]_1/ApplyMomentum-op22": {
"shape": [
[
10
]
],
"edge_type": "data",
"independent_layout": true,
"data_type": "DT_TENSOR[DT_FLOAT32]"
}
},
"output_i": 0,
"proxy_input": {},
"proxy_output": {
"Default/optimizer-Momentum/ApplyMomentum[8]_1": {
"edge_type": "data"
}
},
"subnode_count": 0,
"independent_layout": true
},
{
"name": "graph_0/Default/optimizer-Momentum/Parameter[18]_7/moments.fc3.bias",
"type": "Parameter",
"attr": {
"type": "DT_TENSOR[DT_FLOAT32]",
"shape": "[[10]]"
},
"input": {},
"output": {
"graph_0/Default/optimizer-Momentum/ApplyMomentum[8]_1/ApplyMomentum-op22": {
"shape": [
[
10
]
],
"edge_type": "data",
"independent_layout": true,
"data_type": "DT_TENSOR[DT_FLOAT32]"
}
},
"output_i": 0,
"proxy_input": {},
"proxy_output": {
"Default/optimizer-Momentum/ApplyMomentum[8]_1": {
"edge_type": "data"
}
},
"subnode_count": 0,
"independent_layout": true
},
{
"name": "graph_0/Default/optimizer-Momentum/Parameter[18]_7/learning_rate",
"type": "Parameter",
"attr": {
"type": "DT_TENSOR[DT_FLOAT32]",
"shape": "[[]]"
},
"input": {},
"output": {
"graph_0/Default/optimizer-Momentum/ApplyMomentum[8]_1/ApplyMomentum-op22": {
"shape": [
[]
],
"edge_type": "data",
"independent_layout": true,
"data_type": "DT_TENSOR[DT_FLOAT32]"
},
"graph_0/Default/optimizer-Momentum/ApplyMomentum[8]_1/ApplyMomentum-op25": {
"shape": [
[]
],
"edge_type": "data",
"independent_layout": true,
"data_type": "DT_TENSOR[DT_FLOAT32]"
},
"graph_0/Default/optimizer-Momentum/ApplyMomentum[8]_1/ApplyMomentum-op30": {
"shape": [
[]
],
"edge_type": "data",
"independent_layout": true,
"data_type": "DT_TENSOR[DT_FLOAT32]"
},
"graph_0/Default/optimizer-Momentum/ApplyMomentum[8]_1/ApplyMomentum-op33": {
"shape": [
[]
],
"edge_type": "data",
"independent_layout": true,
"data_type": "DT_TENSOR[DT_FLOAT32]"
},
"graph_0/Default/optimizer-Momentum/ApplyMomentum[8]_1/ApplyMomentum-op38": {
"shape": [
[]
],
"edge_type": "data",
"independent_layout": true,
"data_type": "DT_TENSOR[DT_FLOAT32]"
},
"graph_0/Default/optimizer-Momentum/ApplyMomentum[8]_1/ApplyMomentum-op41": {
"shape": [
[]
],
"edge_type": "data",
"independent_layout": true,
"data_type": "DT_TENSOR[DT_FLOAT32]"
},
"graph_0/Default/optimizer-Momentum/ApplyMomentum[8]_1/ApplyMomentum-op49": {
"shape": [
[]
],
"edge_type": "data",
"independent_layout": true,
"data_type": "DT_TENSOR[DT_FLOAT32]"
},
"graph_0/Default/optimizer-Momentum/ApplyMomentum[8]_1/ApplyMomentum-op56": {
"shape": [
[]
],
"edge_type": "data",
"independent_layout": true,
"data_type": "DT_TENSOR[DT_FLOAT32]"
}
},
"output_i": 0,
"proxy_input": {},
"proxy_output": {
"Default/optimizer-Momentum/ApplyMomentum[8]_1": {
"edge_type": "data"
}
},
"subnode_count": 0,
"independent_layout": true
},
{
"name": "graph_0/Default/optimizer-Momentum/Parameter[18]_7/momentum",
"type": "Parameter",
"attr": {
"type": "DT_TENSOR[DT_FLOAT32]",
"shape": "[[]]"
},
"input": {},
"output": {
"graph_0/Default/optimizer-Momentum/ApplyMomentum[8]_1/ApplyMomentum-op22": {
"shape": [
[]
],
"edge_type": "data",
"independent_layout": true,
"data_type": "DT_TENSOR[DT_FLOAT32]"
},
"graph_0/Default/optimizer-Momentum/ApplyMomentum[8]_1/ApplyMomentum-op25": {
"shape": [
[]
],
"edge_type": "data",
"independent_layout": true,
"data_type": "DT_TENSOR[DT_FLOAT32]"
},
"graph_0/Default/optimizer-Momentum/ApplyMomentum[8]_1/ApplyMomentum-op30": {
"shape": [
[]
],
"edge_type": "data",
"independent_layout": true,
"data_type": "DT_TENSOR[DT_FLOAT32]"
},
"graph_0/Default/optimizer-Momentum/ApplyMomentum[8]_1/ApplyMomentum-op33": {
"shape": [
[]
],
"edge_type": "data",
"independent_layout": true,
"data_type": "DT_TENSOR[DT_FLOAT32]"
},
"graph_0/Default/optimizer-Momentum/ApplyMomentum[8]_1/ApplyMomentum-op38": {
"shape": [
[]
],
"edge_type": "data",
"independent_layout": true,
"data_type": "DT_TENSOR[DT_FLOAT32]"
},
"graph_0/Default/optimizer-Momentum/ApplyMomentum[8]_1/ApplyMomentum-op41": {
"shape": [
[]
],
"edge_type": "data",
"independent_layout": true,
"data_type": "DT_TENSOR[DT_FLOAT32]"
},
"graph_0/Default/optimizer-Momentum/ApplyMomentum[8]_1/ApplyMomentum-op49": {
"shape": [
[]
],
"edge_type": "data",
"independent_layout": true,
"data_type": "DT_TENSOR[DT_FLOAT32]"
},
"graph_0/Default/optimizer-Momentum/ApplyMomentum[8]_1/ApplyMomentum-op56": {
"shape": [
[]
],
"edge_type": "data",
"independent_layout": true,
"data_type": "DT_TENSOR[DT_FLOAT32]"
}
},
"output_i": 0,
"proxy_input": {},
"proxy_output": {
"Default/optimizer-Momentum/ApplyMomentum[8]_1": {
"edge_type": "data"
}
},
"subnode_count": 0,
"independent_layout": true
},
{
"name": "graph_0/Default/optimizer-Momentum/Parameter[18]_7/fc3.weight",
"type": "Parameter",
"attr": {
"type": "DT_TENSOR[DT_FLOAT32]",
"shape": "[[10, 84]]"
},
"input": {},
"output": {
"graph_0/Default/optimizer-Momentum/ApplyMomentum[8]_1/ApplyMomentum-op25": {
"shape": [
[
10,
84
]
],
"edge_type": "data",
"independent_layout": true,
"data_type": "DT_TENSOR[DT_FLOAT32]"
}
},
"output_i": 0,
"proxy_input": {},
"proxy_output": {
"Default/optimizer-Momentum/ApplyMomentum[8]_1": {
"edge_type": "data"
}
},
"subnode_count": 0,
"independent_layout": true
},
{
"name": "graph_0/Default/optimizer-Momentum/Parameter[18]_7/moments.fc3.weight",
"type": "Parameter",
"attr": {
"type": "DT_TENSOR[DT_FLOAT32]",
"shape": "[[10, 84]]"
},
"input": {},
"output": {
"graph_0/Default/optimizer-Momentum/ApplyMomentum[8]_1/ApplyMomentum-op25": {
"shape": [
[
10,
84
]
],
"edge_type": "data",
"independent_layout": true,
"data_type": "DT_TENSOR[DT_FLOAT32]"
}
},
"output_i": 0,
"proxy_input": {},
"proxy_output": {
"Default/optimizer-Momentum/ApplyMomentum[8]_1": {
"edge_type": "data"
}
},
"subnode_count": 0,
"independent_layout": true
},
{
"name": "graph_0/Default/optimizer-Momentum/Parameter[18]_7/fc2.bias",
"type": "Parameter",
"attr": {
"type": "DT_TENSOR[DT_FLOAT32]",
"shape": "[[84]]"
},
"input": {},
"output": {
"graph_0/Default/optimizer-Momentum/ApplyMomentum[8]_1/ApplyMomentum-op30": {
"shape": [
[
84
]
],
"edge_type": "data",
"independent_layout": true,
"data_type": "DT_TENSOR[DT_FLOAT32]"
}
},
"output_i": 0,
"proxy_input": {},
"proxy_output": {
"Default/optimizer-Momentum/ApplyMomentum[8]_1": {
"edge_type": "data"
}
},
"subnode_count": 0,
"independent_layout": true
},
{
"name": "graph_0/Default/optimizer-Momentum/Parameter[18]_7/moments.fc2.bias",
"type": "Parameter",
"attr": {
"type": "DT_TENSOR[DT_FLOAT32]",
"shape": "[[84]]"
},
"input": {},
"output": {
"graph_0/Default/optimizer-Momentum/ApplyMomentum[8]_1/ApplyMomentum-op30": {
"shape": [
[
84
]
],
"edge_type": "data",
"independent_layout": true,
"data_type": "DT_TENSOR[DT_FLOAT32]"
}
},
"output_i": 0,
"proxy_input": {},
"proxy_output": {
"Default/optimizer-Momentum/ApplyMomentum[8]_1": {
"edge_type": "data"
}
},
"subnode_count": 0,
"independent_layout": true
},
{
"name": "graph_0/Default/optimizer-Momentum/Parameter[18]_7/fc2.weight",
"type": "Parameter",
"attr": {
"type": "DT_TENSOR[DT_FLOAT32]",
"shape": "[[84, 120]]"
},
"input": {},
"output": {
"graph_0/Default/optimizer-Momentum/ApplyMomentum[8]_1/ApplyMomentum-op33": {
"shape": [
[
84,
120
]
],
"edge_type": "data",
"independent_layout": true,
"data_type": "DT_TENSOR[DT_FLOAT32]"
}
},
"output_i": 0,
"proxy_input": {},
"proxy_output": {
"Default/optimizer-Momentum/ApplyMomentum[8]_1": {
"edge_type": "data"
}
},
"subnode_count": 0,
"independent_layout": true
},
{
"name": "graph_0/Default/optimizer-Momentum/Parameter[18]_7/moments.fc2.weight",
"type": "Parameter",
"attr": {
"type": "DT_TENSOR[DT_FLOAT32]",
"shape": "[[84, 120]]"
},
"input": {},
"output": {
"graph_0/Default/optimizer-Momentum/ApplyMomentum[8]_1/ApplyMomentum-op33": {
"shape": [
[
84,
120
]
],
"edge_type": "data",
"independent_layout": true,
"data_type": "DT_TENSOR[DT_FLOAT32]"
}
},
"output_i": 0,
"proxy_input": {},
"proxy_output": {
"Default/optimizer-Momentum/ApplyMomentum[8]_1": {
"edge_type": "data"
}
},
"subnode_count": 0,
"independent_layout": true
},
{
"name": "graph_0/Default/optimizer-Momentum/Parameter[18]_7/fc1.bias",
"type": "Parameter",
"attr": {
"type": "DT_TENSOR[DT_FLOAT32]",
"shape": "[[120]]"
},
"input": {},
"output": {
"graph_0/Default/optimizer-Momentum/ApplyMomentum[8]_1/ApplyMomentum-op38": {
"shape": [
[
120
]
],
"edge_type": "data",
"independent_layout": true,
"data_type": "DT_TENSOR[DT_FLOAT32]"
}
},
"output_i": 0,
"proxy_input": {},
"proxy_output": {
"Default/optimizer-Momentum/ApplyMomentum[8]_1": {
"edge_type": "data"
}
},
"subnode_count": 0,
"independent_layout": true
},
{
"name": "graph_0/Default/optimizer-Momentum/Parameter[18]_7/moments.fc1.bias",
"type": "Parameter",
"attr": {
"type": "DT_TENSOR[DT_FLOAT32]",
"shape": "[[120]]"
},
"input": {},
"output": {
"graph_0/Default/optimizer-Momentum/ApplyMomentum[8]_1/ApplyMomentum-op38": {
"shape": [
[
120
]
],
"edge_type": "data",
"independent_layout": true,
"data_type": "DT_TENSOR[DT_FLOAT32]"
}
},
"output_i": 0,
"proxy_input": {},
"proxy_output": {
"Default/optimizer-Momentum/ApplyMomentum[8]_1": {
"edge_type": "data"
}
},
"subnode_count": 0,
"independent_layout": true
},
{
"name": "graph_0/Default/optimizer-Momentum/Parameter[18]_7/fc1.weight",
"type": "Parameter",
"attr": {
"type": "DT_TENSOR[DT_FLOAT32]",
"shape": "[[120, 400]]"
},
"input": {},
"output": {
"graph_0/Default/optimizer-Momentum/ApplyMomentum[8]_1/ApplyMomentum-op41": {
"shape": [
[
120,
400
]
],
"edge_type": "data",
"independent_layout": true,
"data_type": "DT_TENSOR[DT_FLOAT32]"
}
},
"output_i": 0,
"proxy_input": {},
"proxy_output": {
"Default/optimizer-Momentum/ApplyMomentum[8]_1": {
"edge_type": "data"
}
},
"subnode_count": 0,
"independent_layout": true
},
{
"name": "graph_0/Default/optimizer-Momentum/Parameter[18]_7/moments.fc1.weight",
"type": "Parameter",
"attr": {
"type": "DT_TENSOR[DT_FLOAT32]",
"shape": "[[120, 400]]"
},
"input": {},
"output": {
"graph_0/Default/optimizer-Momentum/ApplyMomentum[8]_1/ApplyMomentum-op41": {
"shape": [
[
120,
400
]
],
"edge_type": "data",
"independent_layout": true,
"data_type": "DT_TENSOR[DT_FLOAT32]"
}
},
"output_i": 0,
"proxy_input": {},
"proxy_output": {
"Default/optimizer-Momentum/ApplyMomentum[8]_1": {
"edge_type": "data"
}
},
"subnode_count": 0,
"independent_layout": true
},
{
"name": "graph_0/Default/optimizer-Momentum/Parameter[18]_7/conv2.weight",
"type": "Parameter",
"attr": {
"type": "DT_TENSOR[DT_FLOAT32]",
"shape": "[[16, 6, 5, 5]]"
},
"input": {},
"output": {
"graph_0/Default/optimizer-Momentum/ApplyMomentum[8]_1/ApplyMomentum-op49": {
"shape": [
[
16,
6,
5,
5
]
],
"edge_type": "data",
"independent_layout": true,
"data_type": "DT_TENSOR[DT_FLOAT32]"
}
},
"output_i": 0,
"proxy_input": {},
"proxy_output": {
"Default/optimizer-Momentum/ApplyMomentum[8]_1": {
"edge_type": "data"
}
},
"subnode_count": 0,
"independent_layout": true
},
{
"name": "graph_0/Default/optimizer-Momentum/Parameter[18]_7/moments.conv2.weight",
"type": "Parameter",
"attr": {
"type": "DT_TENSOR[DT_FLOAT32]",
"shape": "[[16, 6, 5, 5]]"
},
"input": {},
"output": {
"graph_0/Default/optimizer-Momentum/ApplyMomentum[8]_1/ApplyMomentum-op49": {
"shape": [
[
16,
6,
5,
5
]
],
"edge_type": "data",
"independent_layout": true,
"data_type": "DT_TENSOR[DT_FLOAT32]"
}
},
"output_i": 0,
"proxy_input": {},
"proxy_output": {
"Default/optimizer-Momentum/ApplyMomentum[8]_1": {
"edge_type": "data"
}
},
"subnode_count": 0,
"independent_layout": true
},
{
"name": "graph_0/Default/optimizer-Momentum/Parameter[18]_7/conv1.weight",
"type": "Parameter",
"attr": {
"type": "DT_TENSOR[DT_FLOAT32]",
"shape": "[[6, 1, 5, 5]]"
},
"input": {},
"output": {
"graph_0/Default/optimizer-Momentum/ApplyMomentum[8]_1/ApplyMomentum-op56": {
"shape": [
[
6,
1,
5,
5
]
],
"edge_type": "data",
"independent_layout": true,
"data_type": "DT_TENSOR[DT_FLOAT32]"
}
},
"output_i": 0,
"proxy_input": {},
"proxy_output": {
"Default/optimizer-Momentum/ApplyMomentum[8]_1": {
"edge_type": "data"
}
},
"subnode_count": 0,
"independent_layout": true
},
{
"name": "graph_0/Default/optimizer-Momentum/Parameter[18]_7/moments.conv1.weight",
"type": "Parameter",
"attr": {
"type": "DT_TENSOR[DT_FLOAT32]",
"shape": "[[6, 1, 5, 5]]"
},
"input": {},
"output": {
"graph_0/Default/optimizer-Momentum/ApplyMomentum[8]_1/ApplyMomentum-op56": {
"shape": [
[
6,
1,
5,
5
]
],
"edge_type": "data",
"independent_layout": true,
"data_type": "DT_TENSOR[DT_FLOAT32]"
}
},
"output_i": 0,
"proxy_input": {},
"proxy_output": {
"Default/optimizer-Momentum/ApplyMomentum[8]_1": {
"edge_type": "data"
}
},
"subnode_count": 0,
"independent_layout": true
}
]
}
}
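
The fixture ending above (multi_retrieve_aggregation_scope_node.json) exercises the multigraph node schema: every node carries "input"/"output" edge maps keyed by the peer node name, and Parameter nodes under an aggregation scope route their edges through "proxy_output". A minimal sketch of walking such a fixture with only the standard library (the file path is a stand-in for wherever a local copy of the expected-result JSON is kept):

import json

# Load a saved expected-result fixture (stand-in path; use your local copy).
with open("multi_retrieve_aggregation_scope_node.json", encoding="utf-8") as file_handle:
    result = json.load(file_handle)

# Each Parameter node lists the ApplyMomentum ops it feeds in its "output" map.
for node in result["graph"]["nodes"]:
    if node.get("type") != "Parameter":
        continue
    for consumer, edge in sorted(node.get("output", {}).items()):
        print("{} -> {} shape={} dtype={}".format(
            node["name"], consumer, edge["shape"], edge["data_type"]))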

+ 44
- 0
tests/st/func/debugger/expect_results/restful_results/multi_retrieve_all.json

@@ -0,0 +1,44 @@
{
"metadata": {
"state": "waiting",
"step": 1,
"device_name": "0",
"node_name": "",
"backend": "Ascend",
"enable_recheck": false,
"graph_name": ""
},
"graph": {
"graph_names": [
"graph_0",
"graph_1"
],
"nodes": [
{
"name": "graph_0",
"type": "name_scope",
"attr": {},
"input": {},
"output": {},
"output_i": 0,
"proxy_input": {},
"proxy_output": {},
"subnode_count": 2,
"independent_layout": false
},
{
"name": "graph_1",
"type": "name_scope",
"attr": {},
"input": {},
"output": {},
"output_i": 0,
"proxy_input": {},
"proxy_output": {},
"subnode_count": 2,
"independent_layout": false
}
]
},
"watch_points": []
}

+ 534
- 0
tests/st/func/debugger/expect_results/restful_results/multi_retrieve_scope_node.json

@@ -0,0 +1,534 @@
{
"graph": {
"nodes": [
{
"name": "graph_0/Default",
"type": "name_scope",
"attr": {},
"input": {
"graph_0/Gradients/Default/network-WithLossCell/_backbone-LeNet5/fc3-Dense/gradBiasAdd/BiasAddGrad-op21": {
"shape": [
[
10
]
],
"edge_type": "data",
"independent_layout": false,
"data_type": "DT_TENSOR[DT_FLOAT32]"
},
"graph_0/Gradients/Default/network-WithLossCell/_backbone-LeNet5/fc3-Dense/gradMatMul/MatMul[6]_5/MatMul-op24": {
"shape": [
[
10,
84
]
],
"edge_type": "data",
"independent_layout": false,
"data_type": "DT_TENSOR[DT_FLOAT32]"
},
"graph_0/Gradients/Default/network-WithLossCell/_backbone-LeNet5/fc3-Dense/gradBiasAdd/BiasAddGrad-op29": {
"shape": [
[
84
]
],
"edge_type": "data",
"independent_layout": false,
"data_type": "DT_TENSOR[DT_FLOAT32]"
},
"graph_0/Gradients/Default/network-WithLossCell/_backbone-LeNet5/fc3-Dense/gradMatMul/MatMul[6]_5/MatMul-op32": {
"shape": [
[
84,
120
]
],
"edge_type": "data",
"independent_layout": false,
"data_type": "DT_TENSOR[DT_FLOAT32]"
},
"graph_0/Gradients/Default/network-WithLossCell/_backbone-LeNet5/fc3-Dense/gradBiasAdd/BiasAddGrad-op37": {
"shape": [
[
120
]
],
"edge_type": "data",
"independent_layout": false,
"data_type": "DT_TENSOR[DT_FLOAT32]"
},
"graph_0/Gradients/Default/network-WithLossCell/_backbone-LeNet5/fc3-Dense/gradMatMul/MatMul[6]_5/MatMul-op40": {
"shape": [
[
120,
400
]
],
"edge_type": "data",
"independent_layout": false,
"data_type": "DT_TENSOR[DT_FLOAT32]"
},
"graph_0/Gradients/Default/network-WithLossCell/_backbone-LeNet5/conv2-Conv2d/gradConv2D/Conv2DBackpropFilter-op48": {
"shape": [
[
16,
6,
5,
5
]
],
"edge_type": "data",
"independent_layout": false,
"data_type": "DT_TENSOR[DT_FLOAT32]"
},
"graph_0/Gradients/Default/network-WithLossCell/_backbone-LeNet5/conv1-Conv2d/gradConv2D/Conv2DBackpropFilter-op55": {
"shape": [
[
6,
1,
5,
5
]
],
"edge_type": "data",
"independent_layout": false,
"data_type": "DT_TENSOR[DT_FLOAT32]"
}
},
"output": {
"graph_0/Gradients/Default/network-WithLossCell/_backbone-LeNet5/conv2-Conv2d/gradConv2D/Conv2DBackpropInput-op52": {
"shape": [
[
16,
6,
5,
5
]
],
"edge_type": "data",
"independent_layout": false,
"data_type": "DT_TENSOR[DT_FLOAT32]"
},
"graph_0/Gradients/Default/network-WithLossCell/_backbone-LeNet5/conv1-Conv2d/gradConv2D/Conv2DBackpropFilter-op55": {
"shape": [
[
32,
1,
32,
32
]
],
"edge_type": "data",
"independent_layout": false,
"data_type": "DT_TENSOR[DT_FLOAT32]"
},
"graph_0/Gradients/Default/network-WithLossCell/_backbone-LeNet5/max_pool2d-MaxPool2d/gradMaxPoolWithArgmax/MaxPoolGradWithArgmax-op53": {
"shape": [
[
32,
6,
4,
14
]
],
"edge_type": "data",
"independent_layout": false,
"data_type": "DT_TENSOR[DT_UINT16]"
},
"graph_0/Gradients/Default/network-WithLossCell/_backbone-LeNet5/max_pool2d-MaxPool2d/gradMaxPoolWithArgmax/MaxPoolGradWithArgmax-op46": {
"shape": [
[
32,
16,
4,
3
]
],
"edge_type": "data",
"independent_layout": false,
"data_type": "DT_TENSOR[DT_UINT16]"
},
"graph_0/Gradients/Default/network-WithLossCell/_backbone-LeNet5/fc3-Dense/gradMatMul/MatMul[6]_5/MatMul-op40": {
"shape": [
[
32,
400
]
],
"edge_type": "data",
"independent_layout": false,
"data_type": "DT_TENSOR[DT_FLOAT32]"
},
"graph_0/Gradients/Default/network-WithLossCell/_backbone-LeNet5/relu-ReLU/gradReLU/ReluGrad-op36": {
"shape": [
[
32,
120
]
],
"edge_type": "data",
"independent_layout": false,
"data_type": "DT_TENSOR[DT_FLOAT32]"
},
"graph_0/Gradients/Default/network-WithLossCell/_backbone-LeNet5/fc3-Dense/gradMatMul/MatMul[6]_5/MatMul-op32": {
"shape": [
[
32,
120
]
],
"edge_type": "data",
"independent_layout": false,
"data_type": "DT_TENSOR[DT_FLOAT32]"
},
"graph_0/Gradients/Default/network-WithLossCell/_backbone-LeNet5/relu-ReLU/gradReLU/ReluGrad-op28": {
"shape": [
[
32,
84
]
],
"edge_type": "data",
"independent_layout": false,
"data_type": "DT_TENSOR[DT_FLOAT32]"
},
"graph_0/Gradients/Default/network-WithLossCell/_backbone-LeNet5/fc3-Dense/gradMatMul/MatMul[6]_5/MatMul-op24": {
"shape": [
[
32,
84
]
],
"edge_type": "data",
"independent_layout": false,
"data_type": "DT_TENSOR[DT_FLOAT32]"
},
"graph_0/Gradients/Default/network-WithLossCell/_backbone-LeNet5/conv2-Conv2d/gradConv2D/Conv2DBackpropFilter-op48": {
"shape": [
[
32,
6,
14,
14
]
],
"edge_type": "data",
"independent_layout": false,
"data_type": "DT_TENSOR[DT_FLOAT32]"
},
"graph_0/Gradients/Default/network-WithLossCell/_loss_fn-SoftmaxCrossEntropyWithLogits/gradSoftmaxCrossEntropyWithLogits/Mul-op20": {
"shape": [
[
32,
10
]
],
"edge_type": "data",
"independent_layout": false,
"data_type": "DT_TENSOR[DT_FLOAT32]"
},
"graph_0/Gradients/Default/network-WithLossCell/_backbone-LeNet5/relu-ReLU/gradReLU/ReluGradV2-op92": {
"shape": [
[
32,
1,
10,
10,
2
]
],
"edge_type": "data",
"independent_layout": false,
"data_type": "DT_TENSOR[DT_UINT8]"
},
"graph_0/Gradients/Default/network-WithLossCell/_backbone-LeNet5/relu-ReLU/gradReLU/ReluGradV2-op94": {
"shape": [
[
32,
1,
28,
28,
2
]
],
"edge_type": "data",
"independent_layout": false,
"data_type": "DT_TENSOR[DT_UINT8]"
}
},
"output_i": 0,
"proxy_input": {},
"proxy_output": {},
"subnode_count": 7,
"independent_layout": false
},
{
"name": "graph_0/Gradients",
"type": "name_scope",
"attr": {},
"input": {
"graph_0/Default/tuple_getitem[10]_0/tuple_getitem-op210": {
"shape": [
[
32,
10
]
],
"edge_type": "data",
"independent_layout": false,
"data_type": "DT_TENSOR[DT_FLOAT32]"
},
"graph_0/Default/network-WithLossCell/_backbone-LeNet5/relu-ReLU/ReLU-op15": {
"shape": [
[
32,
84
]
],
"edge_type": "data",
"independent_layout": false,
"data_type": "DT_TENSOR[DT_FLOAT32]"
},
"graph_0/Default/network-WithLossCell/_backbone-LeNet5/relu-ReLU/ReLU-op12": {
"shape": [
[
32,
120
]
],
"edge_type": "data",
"independent_layout": false,
"data_type": "DT_TENSOR[DT_FLOAT32]"
},
"graph_0/Default/network-WithLossCell/_backbone-LeNet5/max_pool2d-MaxPool2d/Cast-op205": {
"shape": [
[
32,
16,
10,
10
]
],
"edge_type": "data",
"independent_layout": false,
"data_type": "DT_TENSOR[DT_FLOAT32]"
},
"graph_0/Default/tuple_getitem[10]_0/tuple_getitem-op206": {
"shape": [
[
32,
16,
4,
3
]
],
"edge_type": "data",
"independent_layout": false,
"data_type": "DT_TENSOR[DT_UINT16]"
},
"graph_0/Default/tuple_getitem[10]_0/tuple_getitem-op202": {
"shape": [
[
32,
1,
10,
10,
2
]
],
"edge_type": "data",
"independent_layout": false,
"data_type": "DT_TENSOR[DT_UINT8]"
},
"graph_0/Default/tuple_getitem[10]_0/tuple_getitem-op197": {
"shape": [
[
32,
6,
14,
14
]
],
"edge_type": "data",
"independent_layout": false,
"data_type": "DT_TENSOR[DT_FLOAT32]"
},
"graph_0/Default/network-WithLossCell/_backbone-LeNet5/conv2-Conv2d/Cast-op188": {
"shape": [
[
16,
6,
5,
5
]
],
"edge_type": "data",
"independent_layout": false,
"data_type": "DT_TENSOR[DT_FLOAT32]"
},
"graph_0/Default/network-WithLossCell/_backbone-LeNet5/max_pool2d-MaxPool2d/Cast-op195": {
"shape": [
[
32,
6,
28,
28
]
],
"edge_type": "data",
"independent_layout": false,
"data_type": "DT_TENSOR[DT_FLOAT32]"
},
"graph_0/Default/tuple_getitem[10]_0/tuple_getitem-op196": {
"shape": [
[
32,
6,
4,
14
]
],
"edge_type": "data",
"independent_layout": false,
"data_type": "DT_TENSOR[DT_UINT16]"
},
"graph_0/Default/tuple_getitem[10]_0/tuple_getitem-op192": {
"shape": [
[
32,
1,
28,
28,
2
]
],
"edge_type": "data",
"independent_layout": false,
"data_type": "DT_TENSOR[DT_UINT8]"
},
"graph_0/Default/network-WithLossCell/_backbone-LeNet5/conv1-Conv2d/Cast-op190": {
"shape": [
[
32,
1,
32,
32
]
],
"edge_type": "data",
"independent_layout": false,
"data_type": "DT_TENSOR[DT_FLOAT32]"
},
"graph_0/Default/network-WithLossCell/_backbone-LeNet5/flatten-Flatten/Reshape-op9": {
"shape": [
[
32,
400
]
],
"edge_type": "data",
"independent_layout": false,
"data_type": "DT_TENSOR[DT_FLOAT32]"
}
},
"output": {
"graph_0/Default/optimizer-Momentum/ApplyMomentum[8]_1/ApplyMomentum-op22": {
"shape": [
[
10
]
],
"edge_type": "data",
"independent_layout": false,
"data_type": "DT_TENSOR[DT_FLOAT32]"
},
"graph_0/Default/optimizer-Momentum/ApplyMomentum[8]_1/ApplyMomentum-op30": {
"shape": [
[
84
]
],
"edge_type": "data",
"independent_layout": false,
"data_type": "DT_TENSOR[DT_FLOAT32]"
},
"graph_0/Default/optimizer-Momentum/ApplyMomentum[8]_1/ApplyMomentum-op38": {
"shape": [
[
120
]
],
"edge_type": "data",
"independent_layout": false,
"data_type": "DT_TENSOR[DT_FLOAT32]"
},
"graph_0/Default/optimizer-Momentum/ApplyMomentum[8]_1/ApplyMomentum-op49": {
"shape": [
[
16,
6,
5,
5
]
],
"edge_type": "data",
"independent_layout": false,
"data_type": "DT_TENSOR[DT_FLOAT32]"
},
"graph_0/Default/optimizer-Momentum/ApplyMomentum[8]_1/ApplyMomentum-op56": {
"shape": [
[
6,
1,
5,
5
]
],
"edge_type": "data",
"independent_layout": false,
"data_type": "DT_TENSOR[DT_FLOAT32]"
},
"graph_0/Default/optimizer-Momentum/ApplyMomentum[8]_1/ApplyMomentum-op25": {
"shape": [
[
10,
84
]
],
"edge_type": "data",
"independent_layout": false,
"data_type": "DT_TENSOR[DT_FLOAT32]"
},
"graph_0/Default/optimizer-Momentum/ApplyMomentum[8]_1/ApplyMomentum-op33": {
"shape": [
[
84,
120
]
],
"edge_type": "data",
"independent_layout": false,
"data_type": "DT_TENSOR[DT_FLOAT32]"
},
"graph_0/Default/optimizer-Momentum/ApplyMomentum[8]_1/ApplyMomentum-op41": {
"shape": [
[
120,
400
]
],
"edge_type": "data",
"independent_layout": false,
"data_type": "DT_TENSOR[DT_FLOAT32]"
}
},
"output_i": 0,
"proxy_input": {},
"proxy_output": {},
"subnode_count": 1,
"independent_layout": false
}
]
}
}

+ 1735
- 0
tests/st/func/debugger/expect_results/restful_results/multi_retrieve_single_node.json
File diff suppressed because it is too large


+ 1
- 0
tests/st/func/debugger/expect_results/restful_results/multi_retrieve_watchpoint.json

@@ -0,0 +1 @@
{"watch_points": [{"id": 1, "watch_condition": {"id": "overflow", "params": [], "abbr": "OVERFLOW"}}]}

+ 1
- 0
tests/st/func/debugger/expect_results/restful_results/multi_run_steps.json

@@ -0,0 +1 @@
{"metadata": {"state": "waiting", "step": 2, "device_name": "0", "node_name": "", "backend": "GPU", "enable_recheck": false, "graph_name": ""}, "graph": {"graph_names": ["graph_0", "graph_1"], "nodes": [{"name": "graph_0", "type": "name_scope", "attr": {}, "input": {}, "output": {}, "output_i": 0, "proxy_input": {}, "proxy_output": {}, "subnode_count": 2, "independent_layout": false}, {"name": "graph_1", "type": "name_scope", "attr": {}, "input": {}, "output": {}, "output_i": 0, "proxy_input": {}, "proxy_output": {}, "subnode_count": 2, "independent_layout": false}]}, "watch_points": [{"id": 1, "watch_condition": {"id": "weight_initialization", "params": [{"name": "zero_percentage_ge", "disable": false, "value": 100}], "abbr": "WI"}}, {"id": 2, "watch_condition": {"id": "weight_change_too_large", "params": [{"name": "abs_update_ratio_mean_gt", "disable": false, "value": 0.1}], "abbr": "WCL"}}, {"id": 3, "watch_condition": {"id": "gradient_vanishing", "params": [{"name": "abs_mean_lt", "disable": false, "value": 1e-09}], "abbr": "GV"}}, {"id": 4, "watch_condition": {"id": "tensor_overflow", "params": [], "abbr": "TO"}}, {"id": 5, "watch_condition": {"id": "tensor_all_zero", "params": [{"name": "zero_percentage_ge", "disable": false, "value": 100}], "abbr": "TZ"}}]}

+ 1
- 0
tests/st/func/debugger/expect_results/restful_results/recommended_watchpoints_at_startup.json

@@ -0,0 +1 @@
{"watch_points": [{"id": 1, "watch_condition": {"id": "overflow", "params": [], "abbr": "OVERFLOW"}}]}

+ 1
- 672
tests/st/func/debugger/expect_results/restful_results/retrieve_aggregation_scope_node.json
File diff suppressed because it is too large


+ 6
- 1
tests/st/func/debugger/expect_results/restful_results/retrieve_all.json

@@ -4,9 +4,14 @@
"step": 1,
"device_name": "0",
"node_name": "",
"backend": "Ascend"
"backend": "Ascend",
"enable_recheck": false,
"graph_name": "graph_0"
},
"graph": {
"graph_names": [
"graph_0"
],
"nodes": [
{
"name": "Default",


+ 5
- 6
tests/st/func/debugger/expect_results/restful_results/retrieve_empty_tensor_history.json

@@ -4,20 +4,19 @@
"name": "Default/TransData-op99:0",
"full_name": "Default/TransData-op99:0",
"node_type": "TransData",
"type": "output"
"type": "output",
"graph_name": "graph_0"
},
{
"name": "Default/args0:0",
"full_name": "Default/args0:0",
"node_type": "Parameter",
"type": "input"
"type": "input",
"graph_name": "graph_0"
}
],
"metadata": {
"state": "waiting",
"step": 1,
"device_name": "0",
"node_name": "",
"backend": "Ascend"
"step": 1
}
}

+ 3
- 4
tests/st/func/debugger/expect_results/restful_results/retrieve_full_tensor_history.json

@@ -5,6 +5,7 @@
"full_name": "Default/TransData-op99:0",
"node_type": "TransData",
"type": "output",
"graph_name": "graph_0",
"step": 1,
"dtype": "DT_FLOAT32",
"shape": [
@@ -19,6 +20,7 @@
"full_name": "Default/args0:0",
"node_type": "Parameter",
"type": "input",
"graph_name": "graph_0",
"step": 1,
"dtype": "DT_FLOAT32",
"shape": [
@@ -31,9 +33,6 @@
],
"metadata": {
"state": "waiting",
"step": 1,
"device_name": "0",
"node_name": "",
"backend": "Ascend"
"step": 1
}
}

+ 547
- 1
tests/st/func/debugger/expect_results/restful_results/retrieve_next_node_on_gpu.json
File diff suppressed because it is too large


+ 1737
- 1
tests/st/func/debugger/expect_results/restful_results/retrieve_single_watchpoint_hit.json
File diff suppressed because it is too large


+ 138
- 0
tests/st/func/debugger/expect_results/restful_results/retrieve_tensor_graph-0.json

@@ -0,0 +1,138 @@
{
"graph": {
"nodes": [
{
"name": "Default/args0",
"full_name": "Default/args0",
"type": "Parameter",
"input": {},
"output": {
"Default/TransData-op99": {
"shape": [
[
32,
1,
32,
32
]
],
"edge_type": "data",
"independent_layout": false,
"data_type": "DT_TENSOR[DT_FLOAT32]",
"slot_mapping": [
[
"0",
""
]
]
}
},
"slots": [
{
"slot": "0",
"statistics": {}
}
],
"graph_name": "graph_0"
},
{
"name": "Default/network-WithLossCell/_backbone-LeNet5/conv1-Conv2d/Cast-op190",
"full_name": "Default/network-WithLossCell/_backbone-LeNet5/conv1-Conv2d/Cast-op190",
"type": "Cast",
"input": {
"Default/TransData-op99": {
"shape": [
[
32,
1,
32,
32
]
],
"edge_type": "data",
"independent_layout": false,
"data_type": "DT_TENSOR[DT_FLOAT32]",
"slot_mapping": [
[
"0",
""
]
]
}
},
"output": {},
"slots": [
{
"slot": "0",
"statistics": {}
}
],
"graph_name": "graph_0"
},
{
"name": "Default/TransData-op99",
"full_name": "Default/TransData-op99",
"type": "TransData",
"input": {
"Default/args0": {
"shape": [
[
32,
1,
32,
32
]
],
"edge_type": "data",
"independent_layout": false,
"data_type": "DT_TENSOR[DT_FLOAT32]",
"slot_mapping": [
[
"0",
""
]
]
}
},
"output": {
"Default/network-WithLossCell/_backbone-LeNet5/conv1-Conv2d/Cast-op190": {
"shape": [
[
32,
1,
32,
32
]
],
"edge_type": "data",
"independent_layout": false,
"data_type": "DT_TENSOR[DT_FLOAT32]",
"slot_mapping": [
[
"0",
""
]
]
}
},
"slots": [
{
"slot": "0",
"watch_points": [
{
"id": 1,
"watch_condition": {
"id": "inf",
"params": [],
"abbr": "INF"
}
}
],
"statistics": {}
}
],
"graph_name": "graph_0"
}
]
}
}
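
In tensor-graph responses like the one above, watchpoints hang off individual output slots rather than off the node itself. A small sketch collecting the watched slots from the parsed response (file path is a stand-in):

import json

with open("retrieve_tensor_graph-0.json", encoding="utf-8") as file_handle:
    response = json.load(file_handle)

# Map "node:slot" to the ids of watchpoints set on that output slot.
watched = {}
for node in response["graph"]["nodes"]:
    for slot in node.get("slots", []):
        points = slot.get("watch_points", [])
        if points:
            watched["{}:{}".format(node["name"], slot["slot"])] = [p["id"] for p in points]

print(watched)  # for this fixture: {'Default/TransData-op99:0': [1]}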

+ 72
- 0
tests/st/func/debugger/expect_results/restful_results/retrieve_tensor_graph-1.json

@@ -0,0 +1,72 @@
{
"graph": {
"nodes": [
{
"name": "Default/optimizer-Momentum/ApplyMomentum[8]_1/ApplyMomentum-op38",
"full_name": "Default/optimizer-Momentum/ApplyMomentum-op38",
"type": "ApplyMomentum",
"input": {
"Default/optimizer-Momentum/Parameter[18]_7/moments.fc1.bias": {
"shape": [
[
120
]
],
"edge_type": "data",
"independent_layout": true,
"data_type": "DT_TENSOR[DT_FLOAT32]",
"slot_mapping": [
[
"0",
""
]
]
}
},
"output": {},
"slots": [
{
"slot": "0",
"statistics": {}
},
{
"slot": "1",
"statistics": {}
}
],
"graph_name": "graph_0"
},
{
"name": "Default/optimizer-Momentum/Parameter[18]_7/moments.fc1.bias",
"full_name": "Default/optimizer-Momentum/Parameter[18]_7/moments.fc1.bias",
"type": "Parameter",
"input": {},
"output": {
"Default/optimizer-Momentum/ApplyMomentum[8]_1/ApplyMomentum-op38": {
"shape": [
[
120
]
],
"edge_type": "data",
"independent_layout": true,
"data_type": "DT_TENSOR[DT_FLOAT32]",
"slot_mapping": [
[
"0",
""
]
]
}
},
"slots": [
{
"slot": "0",
"statistics": {}
}
],
"graph_name": "graph_0"
}
]
}
}

+ 1
- 0
tests/st/func/debugger/expect_results/restful_results/retrieve_tensor_hits-0.json

@@ -0,0 +1 @@
{"watch_points": []}

+ 1
- 0
tests/st/func/debugger/expect_results/restful_results/retrieve_tensor_hits-1.json

@@ -0,0 +1 @@
{"watch_points": []}

+ 1
- 1
tests/st/func/debugger/expect_results/restful_results/retrieve_tensor_value.json

@@ -1 +1 @@
{"tensor_value": {"full_name": "Default/TransData-op99:0", "step": 1, "dtype": "DT_FLOAT32", "shape": [2, 3], "has_prev_step": false, "statistics": {"max": 6.0, "min": 5.0, "avg": 5.5, "count": 2, "nan_count": 0, "neg_inf_count": 0, "pos_inf_count": 0, "overall_max": 6.0, "overall_min": 1.0}, "value": [5.0, 6.0], "name": "Default/TransData-op99:0"}}
{"tensor_value": {"full_name": "Default/TransData-op99:0", "step": 1, "dtype": "DT_FLOAT32", "shape": [2, 3], "has_prev_step": false, "statistics": {"max": 6.0, "min": 5.0, "avg": 5.5, "count": 2, "nan_count": 0, "neg_inf_count": 0, "pos_inf_count": 0, "overall_max": 6.0, "overall_min": 1.0, "overall_avg": 3.5, "overall_count": 6, "overall_nan_count": 0, "overall_neg_inf_count": 0, "overall_pos_inf_count": 0, "overall_zero_count": 0.0, "overall_neg_zero_count": 0.0, "overall_pos_zero_count": 6.0}, "value": [5.0, 6.0], "name": "Default/TransData-op99:0"}}

+ 1
- 1
tests/st/func/debugger/expect_results/restful_results/retrieve_watchpoint_hit.json

@@ -1 +1 @@
{"watch_point_hits": [{"node_name": "Default/TransData-op99", "watch_points": [{"id": 1, "watch_condition": {"condition": "INF"}}]}, {"node_name": "Default/optimizer-Momentum/ApplyMomentum[8]_1/ApplyMomentum-op25", "watch_points": [{"id": 1, "watch_condition": {"condition": "INF"}}]}]}
{"watch_point_hits": [{"node_name": "Default/TransData-op99", "tensors": [{"slot": "0", "watch_points": [{"id": 1, "watch_condition": {"id": "inf", "params": [], "abbr": "INF"}}]}], "graph_name": "graph_0"}, {"node_name": "Default/optimizer-Momentum/ApplyMomentum[8]_1/ApplyMomentum-op25", "tensors": [{"slot": "0", "watch_points": [{"id": 1, "watch_condition": {"id": "inf", "params": [], "abbr": "INF"}}]}], "graph_name": "graph_0"}]}

+ 1
- 0
tests/st/func/debugger/expect_results/restful_results/search_activation.json

@@ -0,0 +1 @@
{"nodes": [{"name": "Default", "type": "name_scope", "nodes": [{"name": "Default/network-WithLossCell", "type": "name_scope", "nodes": [{"name": "Default/network-WithLossCell/_backbone-LeNet5", "type": "name_scope", "nodes": [{"name": "Default/network-WithLossCell/_backbone-LeNet5/relu-ReLU", "type": "name_scope", "nodes": [{"name": "Default/network-WithLossCell/_backbone-LeNet5/relu-ReLU/ReLU-op12", "type": "ReLU", "nodes": []}, {"name": "Default/network-WithLossCell/_backbone-LeNet5/relu-ReLU/ReLU-op15", "type": "ReLU", "nodes": []}]}]}]}]}]}

+ 1
- 0
tests/st/func/debugger/expect_results/restful_results/search_activation_multi_graph.json

@@ -0,0 +1 @@
{"nodes": [{"name": "graph_0", "type": "name_scope", "nodes": [{"name": "graph_0/Default", "type": "name_scope", "nodes": [{"name": "graph_0/Default/network-WithLossCell", "type": "name_scope", "nodes": [{"name": "graph_0/Default/network-WithLossCell/_backbone-LeNet5", "type": "name_scope", "nodes": [{"name": "graph_0/Default/network-WithLossCell/_backbone-LeNet5/relu-ReLU", "type": "name_scope", "nodes": [{"name": "graph_0/Default/network-WithLossCell/_backbone-LeNet5/relu-ReLU/ReLU-op12", "type": "ReLU", "nodes": []}, {"name": "graph_0/Default/network-WithLossCell/_backbone-LeNet5/relu-ReLU/ReLU-op15", "type": "ReLU", "nodes": []}]}]}]}]}]}, {"name": "graph_1", "type": "name_scope", "nodes": [{"name": "graph_1/Default", "type": "name_scope", "nodes": [{"name": "graph_1/Default/network-WithLossCell", "type": "name_scope", "nodes": [{"name": "graph_1/Default/network-WithLossCell/_backbone-LeNet5", "type": "name_scope", "nodes": [{"name": "graph_1/Default/network-WithLossCell/_backbone-LeNet5/relu-ReLU", "type": "name_scope", "nodes": [{"name": "graph_1/Default/network-WithLossCell/_backbone-LeNet5/relu-ReLU/ReLU-op12", "type": "ReLU", "nodes": []}, {"name": "graph_1/Default/network-WithLossCell/_backbone-LeNet5/relu-ReLU/ReLU-op15", "type": "ReLU", "nodes": []}]}]}]}]}]}]}

+ 1
- 0
tests/st/func/debugger/expect_results/restful_results/search_gradient.json

@@ -0,0 +1 @@
{"nodes": [{"name": "Gradients", "type": "name_scope", "nodes": [{"name": "Gradients/Default", "type": "name_scope", "nodes": [{"name": "Gradients/Default/network-WithLossCell", "type": "name_scope", "nodes": [{"name": "Gradients/Default/network-WithLossCell/_backbone-LeNet5", "type": "name_scope", "nodes": [{"name": "Gradients/Default/network-WithLossCell/_backbone-LeNet5/fc3-Dense", "type": "name_scope", "nodes": [{"name": "Gradients/Default/network-WithLossCell/_backbone-LeNet5/fc3-Dense/gradBiasAdd", "type": "name_scope", "nodes": [{"name": "Gradients/Default/network-WithLossCell/_backbone-LeNet5/fc3-Dense/gradBiasAdd/BiasAddGrad-op21", "type": "BiasAddGrad", "nodes": []}, {"name": "Gradients/Default/network-WithLossCell/_backbone-LeNet5/fc3-Dense/gradBiasAdd/BiasAddGrad-op29", "type": "BiasAddGrad", "nodes": []}, {"name": "Gradients/Default/network-WithLossCell/_backbone-LeNet5/fc3-Dense/gradBiasAdd/BiasAddGrad-op37", "type": "BiasAddGrad", "nodes": []}]}, {"name": "Gradients/Default/network-WithLossCell/_backbone-LeNet5/fc3-Dense/gradMatMul", "type": "name_scope", "nodes": [{"name": "Gradients/Default/network-WithLossCell/_backbone-LeNet5/fc3-Dense/gradMatMul/MatMul[6]_5", "type": "aggregation_scope", "nodes": [{"name": "Gradients/Default/network-WithLossCell/_backbone-LeNet5/fc3-Dense/gradMatMul/MatMul[6]_5/MatMul-op24", "type": "MatMul", "nodes": []}, {"name": "Gradients/Default/network-WithLossCell/_backbone-LeNet5/fc3-Dense/gradMatMul/MatMul[6]_5/MatMul-op27", "type": "MatMul", "nodes": []}, {"name": "Gradients/Default/network-WithLossCell/_backbone-LeNet5/fc3-Dense/gradMatMul/MatMul[6]_5/MatMul-op32", "type": "MatMul", "nodes": []}, {"name": "Gradients/Default/network-WithLossCell/_backbone-LeNet5/fc3-Dense/gradMatMul/MatMul[6]_5/MatMul-op35", "type": "MatMul", "nodes": []}, {"name": "Gradients/Default/network-WithLossCell/_backbone-LeNet5/fc3-Dense/gradMatMul/MatMul[6]_5/MatMul-op40", "type": "MatMul", "nodes": []}, {"name": "Gradients/Default/network-WithLossCell/_backbone-LeNet5/fc3-Dense/gradMatMul/MatMul[6]_5/MatMul-op44", "type": "MatMul", "nodes": []}]}]}]}]}]}]}]}]}

+ 31
- 1
tests/st/func/debugger/expect_results/restful_results/search_unwatched_leaf_node.json

@@ -1 +1,31 @@
{"nodes": [{"name": "Default", "type": "name_scope", "nodes": [{"name": "Default/optimizer-Momentum", "type": "name_scope", "nodes": [{"name": "Default/optimizer-Momentum/Parameter[18]_7", "type": "aggregation_scope", "nodes": [{"name": "Default/optimizer-Momentum/Parameter[18]_7/moments.fc3.bias", "type": "Parameter", "nodes": [], "watched": 0}], "watched": 1}], "watched": 1}], "watched": 1}]}
{
"nodes": [
{
"name": "Default",
"type": "name_scope",
"nodes": [
{
"name": "Default/optimizer-Momentum",
"type": "name_scope",
"nodes": [
{
"name": "Default/optimizer-Momentum/Parameter[18]_7",
"type": "aggregation_scope",
"nodes": [
{
"name": "Default/optimizer-Momentum/Parameter[18]_7/moments.fc3.bias",
"type": "Parameter",
"nodes": [],
"watched": 0
}
],
"watched": 0
}
],
"watched": 0
}
],
"watched": 0
}
]
}

+ 1
- 0
tests/st/func/debugger/expect_results/restful_results/search_weight.json

@@ -0,0 +1 @@
{"nodes": [{"name": "Default", "type": "name_scope", "nodes": [{"name": "Default/network-WithLossCell", "type": "name_scope", "nodes": [{"name": "Default/network-WithLossCell/_backbone-LeNet5", "type": "name_scope", "nodes": [{"name": "Default/network-WithLossCell/_backbone-LeNet5/fc3-Dense", "type": "name_scope", "nodes": [{"name": "Default/network-WithLossCell/_backbone-LeNet5/fc3-Dense/Parameter[6]_6", "type": "aggregation_scope", "nodes": [{"name": "Default/network-WithLossCell/_backbone-LeNet5/fc3-Dense/Parameter[6]_6/fc1.weight", "type": "Parameter", "nodes": []}, {"name": "Default/network-WithLossCell/_backbone-LeNet5/fc3-Dense/Parameter[6]_6/fc1.bias", "type": "Parameter", "nodes": []}, {"name": "Default/network-WithLossCell/_backbone-LeNet5/fc3-Dense/Parameter[6]_6/fc2.weight", "type": "Parameter", "nodes": []}, {"name": "Default/network-WithLossCell/_backbone-LeNet5/fc3-Dense/Parameter[6]_6/fc2.bias", "type": "Parameter", "nodes": []}, {"name": "Default/network-WithLossCell/_backbone-LeNet5/fc3-Dense/Parameter[6]_6/fc3.weight", "type": "Parameter", "nodes": []}, {"name": "Default/network-WithLossCell/_backbone-LeNet5/fc3-Dense/Parameter[6]_6/fc3.bias", "type": "Parameter", "nodes": []}]}]}]}]}]}]}

+ 1
- 0
tests/st/func/debugger/expect_results/restful_results/search_weight_multi_graph.json

@@ -0,0 +1 @@
{"nodes": [{"name": "graph_0", "type": "name_scope", "nodes": [{"name": "graph_0/Default", "type": "name_scope", "nodes": [{"name": "graph_0/Default/network-WithLossCell", "type": "name_scope", "nodes": [{"name": "graph_0/Default/network-WithLossCell/_backbone-LeNet5", "type": "name_scope", "nodes": [{"name": "graph_0/Default/network-WithLossCell/_backbone-LeNet5/conv2-Conv2d", "type": "name_scope", "nodes": [{"name": "graph_0/Default/network-WithLossCell/_backbone-LeNet5/conv2-Conv2d/conv2.weight", "type": "Parameter", "nodes": []}]}, {"name": "graph_0/Default/network-WithLossCell/_backbone-LeNet5/conv1-Conv2d", "type": "name_scope", "nodes": [{"name": "graph_0/Default/network-WithLossCell/_backbone-LeNet5/conv1-Conv2d/conv1.weight", "type": "Parameter", "nodes": []}]}, {"name": "graph_0/Default/network-WithLossCell/_backbone-LeNet5/fc3-Dense", "type": "name_scope", "nodes": [{"name": "graph_0/Default/network-WithLossCell/_backbone-LeNet5/fc3-Dense/Parameter[6]_6", "type": "aggregation_scope", "nodes": [{"name": "graph_0/Default/network-WithLossCell/_backbone-LeNet5/fc3-Dense/Parameter[6]_6/fc1.weight", "type": "Parameter", "nodes": []}, {"name": "graph_0/Default/network-WithLossCell/_backbone-LeNet5/fc3-Dense/Parameter[6]_6/fc1.bias", "type": "Parameter", "nodes": []}, {"name": "graph_0/Default/network-WithLossCell/_backbone-LeNet5/fc3-Dense/Parameter[6]_6/fc2.weight", "type": "Parameter", "nodes": []}, {"name": "graph_0/Default/network-WithLossCell/_backbone-LeNet5/fc3-Dense/Parameter[6]_6/fc2.bias", "type": "Parameter", "nodes": []}, {"name": "graph_0/Default/network-WithLossCell/_backbone-LeNet5/fc3-Dense/Parameter[6]_6/fc3.weight", "type": "Parameter", "nodes": []}, {"name": "graph_0/Default/network-WithLossCell/_backbone-LeNet5/fc3-Dense/Parameter[6]_6/fc3.bias", "type": "Parameter", "nodes": []}]}]}]}]}]}]}, {"name": "graph_1", "type": "name_scope", "nodes": [{"name": "graph_1/Default", "type": "name_scope", "nodes": [{"name": "graph_1/Default/network-WithLossCell", "type": "name_scope", "nodes": [{"name": "graph_1/Default/network-WithLossCell/_backbone-LeNet5", "type": "name_scope", "nodes": [{"name": "graph_1/Default/network-WithLossCell/_backbone-LeNet5/conv2-Conv2d", "type": "name_scope", "nodes": [{"name": "graph_1/Default/network-WithLossCell/_backbone-LeNet5/conv2-Conv2d/conv2.weight", "type": "Parameter", "nodes": []}]}, {"name": "graph_1/Default/network-WithLossCell/_backbone-LeNet5/conv1-Conv2d", "type": "name_scope", "nodes": [{"name": "graph_1/Default/network-WithLossCell/_backbone-LeNet5/conv1-Conv2d/conv1.weight", "type": "Parameter", "nodes": []}]}, {"name": "graph_1/Default/network-WithLossCell/_backbone-LeNet5/fc3-Dense", "type": "name_scope", "nodes": [{"name": "graph_1/Default/network-WithLossCell/_backbone-LeNet5/fc3-Dense/Parameter[6]_6", "type": "aggregation_scope", "nodes": [{"name": "graph_1/Default/network-WithLossCell/_backbone-LeNet5/fc3-Dense/Parameter[6]_6/fc1.weight", "type": "Parameter", "nodes": []}, {"name": "graph_1/Default/network-WithLossCell/_backbone-LeNet5/fc3-Dense/Parameter[6]_6/fc1.bias", "type": "Parameter", "nodes": []}, {"name": "graph_1/Default/network-WithLossCell/_backbone-LeNet5/fc3-Dense/Parameter[6]_6/fc2.weight", "type": "Parameter", "nodes": []}, {"name": "graph_1/Default/network-WithLossCell/_backbone-LeNet5/fc3-Dense/Parameter[6]_6/fc2.bias", "type": "Parameter", "nodes": []}, {"name": "graph_1/Default/network-WithLossCell/_backbone-LeNet5/fc3-Dense/Parameter[6]_6/fc3.weight", "type": "Parameter", "nodes": []}, {"name": 
"graph_1/Default/network-WithLossCell/_backbone-LeNet5/fc3-Dense/Parameter[6]_6/fc3.bias", "type": "Parameter", "nodes": []}]}]}]}]}]}]}]}

+ 22
- 7
tests/st/func/debugger/mock_ms_client.py

@@ -28,7 +28,7 @@ from tests.st.func.debugger.conftest import GRAPH_PROTO_FILE
class MockDebuggerClient:
"""Mocked Debugger client."""

def __init__(self, hostname='localhost:50051', backend='Ascend'):
def __init__(self, hostname='localhost:50051', backend='Ascend', graph_num=1):
channel = grpc.insecure_channel(hostname)
self.stub = EventListenerStub(channel)
self.flag = True
@@ -37,6 +37,7 @@ class MockDebuggerClient:
self._leaf_node = []
self._cur_node = ''
self._backend = backend
self._graph_num = graph_num

def _clean(self):
"""Clean cache."""
@@ -122,16 +123,32 @@ class MockDebuggerClient:
assert response.status == EventReply.Status.OK
if training_done is False:
self.send_graph_cmd()
print("finish")

def send_graph_cmd(self):
"""Send graph to debugger server."""
self._step = 1
if self._graph_num > 1:
chunks = []
for i in range(self._graph_num):
chunks.extend(self._get_graph_chunks('graph_' + str(i)))
response = self.stub.SendMultiGraphs(self._generate_graph(chunks))
else:
chunks = self._get_graph_chunks()
response = self.stub.SendGraph(self._generate_graph(chunks))
assert response.status == EventReply.Status.OK
# go to command loop
self.command_loop()

def _get_graph_chunks(self, graph_name='graph_0'):
"""Get graph chunks."""
with open(GRAPH_PROTO_FILE, 'rb') as file_handle:
content = file_handle.read()
size = len(content)
graph = ms_graph_pb2.GraphProto()
graph.ParseFromString(content)
graph.name = 'graph_name'
graph.name = graph_name
content = graph.SerializeToString()
self._leaf_node = [node.full_name for node in graph.node]
        # the max limit of gRPC data size is 4KB
        # split the graph into 3KB chunks
@@ -141,10 +158,8 @@ class MockDebuggerClient:
sub_size = min(chunk_size, size - index)
sub_chunk = Chunk(buffer=content[index: index + sub_size])
chunks.append(sub_chunk)
response = self.stub.SendGraph(self._generate_graph(chunks))
assert response.status == EventReply.Status.OK
# go to command loop
self.command_loop()
chunks[-1].finished = True
return chunks

@staticmethod
def _generate_graph(chunks):
@@ -202,5 +217,5 @@ class MockDebuggerClientThread:
return self._debugger_client_thread

def __exit__(self, exc_type, exc_val, exc_tb):
self._debugger_client_thread.join(timeout=5)
self._debugger_client_thread.join(timeout=3)
self._debugger_client.flag = False
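
The chunking in _get_graph_chunks above is the interesting part of the mock: the serialized graph is sliced into 3KB pieces to stay under the stated gRPC size cap, and the final chunk is flagged as finished. A standalone sketch of that slicing over plain bytes (no protobuf dependency; the constant mirrors the mock's figure):

CHUNK_SIZE = 3 * 1024  # mirror the mock: 3KB chunks under its stated 4KB gRPC limit

def split_into_chunks(content, chunk_size=CHUNK_SIZE):
    """Slice serialized graph bytes; the last chunk is the one to mark finished."""
    return [content[index:index + chunk_size]
            for index in range(0, len(content), chunk_size)]

buffers = split_into_chunks(b"\x00" * 7000)
print([len(buf) for buf in buffers])  # [3072, 3072, 856]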

+ 419
- 86
tests/st/func/debugger/test_restful_api.py

@@ -22,12 +22,20 @@ import os

import pytest

from mindinsight.conf import settings
from tests.st.func.debugger.conftest import DEBUGGER_BASE_URL
from tests.st.func.debugger.mock_ms_client import MockDebuggerClient
from tests.st.func.debugger.utils import check_waiting_state, get_request_result, \
send_and_compare_result


def send_terminate_cmd(app_client):
"""Send terminate command to debugger client."""
url = os.path.join(DEBUGGER_BASE_URL, 'control')
body_data = {'mode': 'terminate'}
send_and_compare_result(app_client, url, body_data)


class TestAscendDebugger:
"""Test debugger on Ascend backend."""

@@ -36,23 +44,6 @@ class TestAscendDebugger:
"""Setup class."""
cls._debugger_client = MockDebuggerClient(backend='Ascend')

@staticmethod
def _send_terminate_cmd(app_client):
"""Send terminate command to debugger client."""
url = os.path.join(DEBUGGER_BASE_URL, 'control')
body_data = {'mode': 'terminate'}
send_and_compare_result(app_client, url, body_data)

@staticmethod
def _create_watchpoint(app_client, condition, expect_id):
"""Create watchpoint."""
url = 'create_watchpoint'
body_data = {'condition': condition,
'watch_nodes': ['Default/optimizer-Momentum/Parameter[18]_7',
'Default/TransData-op99']}
res = get_request_result(app_client, url, body_data)
assert res.get('id') == expect_id

@pytest.mark.level0
@pytest.mark.env_single
@pytest.mark.platform_x86_cpu
@@ -86,10 +77,47 @@ class TestAscendDebugger:
"""Test retrieve when train_begin."""
url = 'retrieve'
with self._debugger_client.get_thread_instance():
flag = check_waiting_state(app_client)
assert flag is True
check_waiting_state(app_client)
send_and_compare_result(app_client, url, body_data, expect_file)
self._send_terminate_cmd(app_client)
send_terminate_cmd(app_client)

def test_get_conditions(self, app_client):
"""Test get conditions for ascend."""
url = '/v1/mindinsight/conditionmgr/train-jobs/train-id/conditions'
body_data = {}
expect_file = 'get_conditions_for_ascend.json'
with self._debugger_client.get_thread_instance():
check_waiting_state(app_client)
send_and_compare_result(app_client, url, body_data, expect_file, method='get', full_url=True)
send_terminate_cmd(app_client)

@pytest.mark.level0
@pytest.mark.env_single
@pytest.mark.platform_x86_cpu
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_gpu_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.parametrize("body_data, expect_file", [
({'mode': 'all'}, 'multi_retrieve_all.json'),
({'mode': 'node', 'params': {'name': 'Default', 'graph_name': 'graph_1'}}, 'retrieve_scope_node.json'),
({'mode': 'node', 'params': {'name': 'graph_0'}}, 'multi_retrieve_scope_node.json'),
({'mode': 'node', 'params': {'name': 'graph_0/Default/optimizer-Momentum/Parameter[18]_7'}},
'multi_retrieve_aggregation_scope_node.json'),
({'mode': 'node', 'params': {
'name': 'graph_0/Default/TransData-op99',
'single_node': True}}, 'multi_retrieve_single_node.json'),
({'mode': 'node', 'params': {
'name': 'Default/TransData-op99',
'single_node': True, 'graph_name': 'graph_0'}}, 'retrieve_single_node.json')
])
def test_multi_retrieve_when_train_begin(self, app_client, body_data, expect_file):
"""Test retrieve when train_begin."""
url = 'retrieve'
debugger_client = MockDebuggerClient(backend='Ascend', graph_num=2)
with debugger_client.get_thread_instance():
check_waiting_state(app_client)
send_and_compare_result(app_client, url, body_data, expect_file)
send_terminate_cmd(app_client)

@pytest.mark.level0
@pytest.mark.env_single
@@ -100,22 +128,21 @@ class TestAscendDebugger:
def test_create_and_delete_watchpoint(self, app_client):
"""Test create and delete watchpoint."""
with self._debugger_client.get_thread_instance():
flag = check_waiting_state(app_client)
assert flag is True
check_waiting_state(app_client)
conditions = [
{'condition': 'MAX_GT', 'param': 1.0},
{'condition': 'MAX_LT', 'param': -1.0},
{'condition': 'MIN_GT', 'param': 1e+32},
{'condition': 'MIN_LT', 'param': -1e+32},
{'condition': 'MAX_MIN_GT', 'param': 0},
{'condition': 'MAX_MIN_LT', 'param': 0},
{'condition': 'MEAN_GT', 'param': 0},
{'condition': 'MEAN_LT', 'param': 0},
{'condition': 'INF'},
{'condition': 'OVERFLOW'},
{'id': 'max_gt', 'params': [{'name': 'param', 'value': 1.0, 'disable': False}]},
{'id': 'max_lt', 'params': [{'name': 'param', 'value': -1.0, 'disable': False}]},
{'id': 'min_gt', 'params': [{'name': 'param', 'value': 1e+32, 'disable': False}]},
{'id': 'min_lt', 'params': [{'name': 'param', 'value': -1e+32, 'disable': False}]},
{'id': 'max_min_gt', 'params': [{'name': 'param', 'value': 0, 'disable': False}]},
{'id': 'max_min_lt', 'params': [{'name': 'param', 'value': 0, 'disable': False}]},
{'id': 'mean_gt', 'params': [{'name': 'param', 'value': 0, 'disable': False}]},
{'id': 'mean_lt', 'params': [{'name': 'param', 'value': 0, 'disable': False}]},
{'id': 'inf', 'params': []},
{'id': 'overflow', 'params': []},
]
for idx, condition in enumerate(conditions):
self._create_watchpoint(app_client, condition, idx + 1)
create_watchpoint(app_client, condition, idx + 1)
# delete 4-th watchpoint
url = 'delete_watchpoint'
body_data = {'watch_point_id': 4}
@@ -125,7 +152,7 @@ class TestAscendDebugger:
body_data = {'mode': 'watchpoint'}
expect_file = 'create_and_delete_watchpoint.json'
send_and_compare_result(app_client, url, body_data, expect_file)
self._send_terminate_cmd(app_client)
send_terminate_cmd(app_client)

@pytest.mark.level0
@pytest.mark.env_single
@@ -138,10 +165,9 @@ class TestAscendDebugger:
watch_point_id = 1
leaf_node_name = 'Default/optimizer-Momentum/Parameter[18]_7/moments.fc3.bias'
with self._debugger_client.get_thread_instance():
flag = check_waiting_state(app_client)
assert flag is True
condition = {'condition': 'INF'}
self._create_watchpoint(app_client, condition, watch_point_id)
check_waiting_state(app_client)
condition = {'id': 'inf', 'params': []}
create_watchpoint(app_client, condition, watch_point_id)
            # update the watchpoint's watch node list
url = 'update_watchpoint'
body_data = {'watch_point_id': watch_point_id,
@@ -153,7 +179,7 @@ class TestAscendDebugger:
body_data = {'name': leaf_node_name, 'watch_point_id': watch_point_id}
expect_file = 'search_unwatched_leaf_node.json'
send_and_compare_result(app_client, url, body_data, expect_file, method='get')
self._send_terminate_cmd(app_client)
send_terminate_cmd(app_client)

@pytest.mark.level0
@pytest.mark.env_single
@@ -164,18 +190,7 @@ class TestAscendDebugger:
def test_watchpoint_hit(self, app_client):
"""Test retrieve watchpoint hit."""
with self._debugger_client.get_thread_instance():
flag = check_waiting_state(app_client)
assert flag is True
self._create_watchpoint(app_client, condition={'condition': 'INF'}, expect_id=1)
# send run command to get watchpoint hit
url = 'control'
body_data = {'mode': 'continue',
'steps': 2}
res = get_request_result(app_client, url, body_data)
assert res == {'metadata': {'state': 'running'}}
            # wait until the server has received the watchpoint hit
flag = check_waiting_state(app_client)
assert flag is True
create_watchpoint_and_wait(app_client)
# check watchpoint hit list
url = 'retrieve'
body_data = {'mode': 'watchpoint_hit'}
@@ -188,11 +203,11 @@ class TestAscendDebugger:
'name': 'Default/TransData-op99',
'single_node': True,
'watch_point_id': 1
}
}
}
expect_file = 'retrieve_single_watchpoint_hit.json'
send_and_compare_result(app_client, url, body_data, expect_file)
self._send_terminate_cmd(app_client)
send_terminate_cmd(app_client)

@pytest.mark.level0
@pytest.mark.env_single
@@ -204,8 +219,7 @@ class TestAscendDebugger:
"""Test retrieve tensor value."""
node_name = 'Default/TransData-op99'
with self._debugger_client.get_thread_instance():
flag = check_waiting_state(app_client)
assert flag is True
check_waiting_state(app_client)
# prepare tensor value
url = 'retrieve_tensor_history'
body_data = {'name': node_name}
@@ -226,7 +240,7 @@ class TestAscendDebugger:
}
expect_file = 'retrieve_tensor_value.json'
send_and_compare_result(app_client, url, body_data, expect_file, method='get')
self._send_terminate_cmd(app_client)
send_terminate_cmd(app_client)

@pytest.mark.level0
@pytest.mark.env_single
@@ -238,15 +252,13 @@ class TestAscendDebugger:
"""Test compare tensor value."""
node_name = 'Default/args0'
with self._debugger_client.get_thread_instance():
flag = check_waiting_state(app_client)
assert flag is True
check_waiting_state(app_client)
# prepare tensor values
url = 'control'
body_data = {'mode': 'continue',
'steps': 2}
get_request_result(app_client, url, body_data)
flag = check_waiting_state(app_client)
assert flag is True
check_waiting_state(app_client)
get_request_result(
app_client=app_client, url='retrieve_tensor_history', body_data={'name': node_name})
res = get_request_result(
@@ -262,7 +274,7 @@ class TestAscendDebugger:
}
expect_file = 'compare_tensors.json'
send_and_compare_result(app_client, url, body_data, expect_file, method='get')
self._send_terminate_cmd(app_client)
send_terminate_cmd(app_client)

@pytest.mark.level0
@pytest.mark.env_single
@@ -277,12 +289,110 @@ class TestAscendDebugger:
def test_retrieve_bfs_node(self, app_client, body_data, expect_file):
"""Test retrieve bfs node."""
with self._debugger_client.get_thread_instance():
flag = check_waiting_state(app_client)
assert flag is True
check_waiting_state(app_client)
# prepare tensor values
url = 'retrieve_node_by_bfs'
send_and_compare_result(app_client, url, body_data, expect_file, method='get')
self._send_terminate_cmd(app_client)
send_terminate_cmd(app_client)

@pytest.mark.level0
@pytest.mark.env_single
@pytest.mark.platform_x86_cpu
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_gpu_training
@pytest.mark.platform_x86_ascend_training
def test_pause(self, app_client):
"""Test pause the training."""
with self._debugger_client.get_thread_instance():
check_waiting_state(app_client)
# send run command to execute to next node
url = 'control'
body_data = {'mode': 'continue',
'steps': -1}
res = get_request_result(app_client, url, body_data)
assert res == {'metadata': {'state': 'running', 'enable_recheck': False}}
# send pause command
url = 'control'
body_data = {'mode': 'pause'}
res = get_request_result(app_client, url, body_data)
assert res == {'metadata': {'state': 'waiting', 'enable_recheck': False}}
send_terminate_cmd(app_client)

@pytest.mark.level0
@pytest.mark.env_single
@pytest.mark.platform_x86_cpu
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_gpu_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.parametrize("url, body_data, enable_recheck", [
('create_watchpoint',
{'condition': {'id': 'inf', 'params': []},
'watch_nodes': ['Default']}, True),
('update_watchpoint',
{'watch_point_id': 1, 'watch_nodes': ['Default/optimizer-Momentum/Parameter[18]_7'],
'mode': 0}, True),
('update_watchpoint',
{'watch_point_id': 1, 'watch_nodes': ['Default/optimizer-Momentum'],
'mode': 1}, True),
('delete_watchpoint', {}, True)
])
def test_recheck(self, app_client, url, body_data, enable_recheck):
"""Test recheck."""
with self._debugger_client.get_thread_instance():
create_watchpoint_and_wait(app_client)
            # send the parametrized watchpoint request and check enable_recheck
res = get_request_result(app_client, url, body_data, method='post')
assert res['metadata']['enable_recheck'] is enable_recheck
send_terminate_cmd(app_client)

@pytest.mark.level0
@pytest.mark.env_single
@pytest.mark.platform_x86_cpu
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_gpu_training
@pytest.mark.platform_x86_ascend_training
def test_recommend_watchpoints(self, app_client):
"""Test generating recommended watchpoints."""
original_value = settings.ENABLE_RECOMMENDED_WATCHPOINTS
settings.ENABLE_RECOMMENDED_WATCHPOINTS = True
try:
with self._debugger_client.get_thread_instance():
check_waiting_state(app_client)
url = 'retrieve'
body_data = {'mode': 'watchpoint'}
expect_file = 'recommended_watchpoints_at_startup.json'
send_and_compare_result(app_client, url, body_data, expect_file, method='post')
send_terminate_cmd(app_client)
finally:
settings.ENABLE_RECOMMENDED_WATCHPOINTS = original_value

@pytest.mark.level0
@pytest.mark.env_single
@pytest.mark.platform_x86_cpu
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_gpu_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.parametrize("body_data, expect_file", [
({'tensor_name': 'Default/TransData-op99:0', 'graph_name': 'graph_0'}, 'retrieve_tensor_graph-0.json'),
({'tensor_name': 'Default/optimizer-Momentum/Parameter[18]_7/moments.fc1.bias:0', 'graph_name': 'graph_0'},
'retrieve_tensor_graph-1.json')
])
def test_retrieve_tensor_graph(self, app_client, body_data, expect_file):
"""Test retrieve tensor graph."""
url = 'tensor_graphs'
with self._debugger_client.get_thread_instance():
create_watchpoint_and_wait(app_client)
send_and_compare_result(app_client, url, body_data, expect_file, method='GET')
send_terminate_cmd(app_client)


class TestGPUDebugger:
"""Test debugger on Ascend backend."""

@classmethod
def setup_class(cls):
"""Setup class."""
cls._debugger_client = MockDebuggerClient(backend='GPU')

@pytest.mark.level0
@pytest.mark.env_single
@@ -294,23 +404,21 @@ class TestAscendDebugger:
"""Test get next node on GPU."""
gpu_debugger_client = MockDebuggerClient(backend='GPU')
with gpu_debugger_client.get_thread_instance():
flag = check_waiting_state(app_client)
assert flag is True
check_waiting_state(app_client)
# send run command to get watchpoint hit
url = 'control'
body_data = {'mode': 'continue',
'level': 'node',
'name': 'Default/TransData-op99'}
res = get_request_result(app_client, url, body_data)
assert res == {'metadata': {'state': 'running'}}
assert res == {'metadata': {'state': 'running', 'enable_recheck': False}}
# get metadata
flag = check_waiting_state(app_client)
assert flag is True
check_waiting_state(app_client)
url = 'retrieve'
body_data = {'mode': 'all'}
expect_file = 'retrieve_next_node_on_gpu.json'
send_and_compare_result(app_client, url, body_data, expect_file)
self._send_terminate_cmd(app_client)
send_terminate_cmd(app_client)

@pytest.mark.level0
@pytest.mark.env_single
@@ -318,20 +426,245 @@ class TestAscendDebugger:
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_gpu_training
@pytest.mark.platform_x86_ascend_training
def test_pause(self, app_client):
"""Test pause the training."""
@pytest.mark.parametrize("url, body_data, enable_recheck", [
('create_watchpoint',
{'condition': {'id': 'inf', 'params': []},
'watch_nodes': ['Default']}, False),
('create_watchpoint',
{'condition': {'id': 'inf', 'params': []},
'watch_nodes': ['Default/TransData-op99']}, True),
('update_watchpoint',
{'watch_point_id': 1, 'watch_nodes': ['Default/optimizer-Momentum/Parameter[18]_7'],
'mode': 0}, True),
('update_watchpoint',
{'watch_point_id': 1, 'watch_nodes': ['Default/optimizer-Momentum'],
'mode': 1}, False),
('update_watchpoint',
[{'watch_point_id': 1, 'watch_nodes': ['Default/optimizer-Momentum'],
'mode': 1},
{'watch_point_id': 1, 'watch_nodes': ['Default/optimizer-Momentum'],
'mode': 0}
], True),
('update_watchpoint',
[{'watch_point_id': 1, 'watch_nodes': ['Default/TransData-op99'],
'mode': 0},
{'watch_point_id': 1, 'watch_nodes': ['Default/TransData-op99'],
'mode': 1}
], True),
('delete_watchpoint', {'watch_point_id': 1}, True)
])
def test_recheck_state(self, app_client, url, body_data, enable_recheck):
"""Test update watchpoint and check the value of enable_recheck."""
with self._debugger_client.get_thread_instance():
flag = check_waiting_state(app_client)
assert flag is True
# send run command to execute to next node
url = 'control'
body_data = {'mode': 'continue',
'steps': -1}
res = get_request_result(app_client, url, body_data)
assert res == {'metadata': {'state': 'running'}}
# send pause command
url = 'control'
body_data = {'mode': 'pause'}
res = get_request_result(app_client, url, body_data)
assert res == {'metadata': {'state': 'waiting'}}
self._send_terminate_cmd(app_client)
create_watchpoint_and_wait(app_client)
if not isinstance(body_data, list):
body_data = [body_data]
for sub_body_data in body_data:
res = get_request_result(app_client, url, sub_body_data, method='post')
assert res['metadata']['enable_recheck'] is enable_recheck
send_terminate_cmd(app_client)

def test_get_conditions(self, app_client):
"""Test get conditions for gpu."""
url = '/v1/mindinsight/conditionmgr/train-jobs/train-id/conditions'
body_data = {}
expect_file = 'get_conditions_for_gpu.json'
with self._debugger_client.get_thread_instance():
check_waiting_state(app_client)
send_and_compare_result(app_client, url, body_data, expect_file, method='get', full_url=True)
send_terminate_cmd(app_client)

@pytest.mark.level0
@pytest.mark.env_single
@pytest.mark.platform_x86_cpu
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_gpu_training
@pytest.mark.platform_x86_ascend_training
def test_recheck(self, app_client):
"""Test recheck request."""
with self._debugger_client.get_thread_instance():
create_watchpoint_and_wait(app_client)
            # send recheck while recheck is disabled
get_request_result(app_client, 'recheck', {}, method='post', expect_code=400)
            # send recheck after a new watchpoint enables it
create_watchpoint(app_client, {'id': 'inf', 'params': []}, 2)
res = get_request_result(app_client, 'recheck', {}, method='post')
assert res['metadata']['enable_recheck'] is False

send_terminate_cmd(app_client)

@pytest.mark.level0
@pytest.mark.env_single
@pytest.mark.platform_x86_cpu
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_gpu_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.parametrize("filter_condition, expect_file", [
({'name': 'fc', 'node_category': 'weight'}, 'search_weight.json'),
({'name': 'fc', 'node_category': 'gradient'}, 'search_gradient.json'),
({'node_category': 'activation'}, 'search_activation.json')
])
def test_search_by_category(self, app_client, filter_condition, expect_file):
"""Test recheck request."""
with self._debugger_client.get_thread_instance():
check_waiting_state(app_client)
send_and_compare_result(app_client, 'search', filter_condition, expect_file,
method='get')
send_terminate_cmd(app_client)


class TestMultiGraphDebugger:
"""Test debugger on Ascend backend."""

@classmethod
def setup_class(cls):
"""Setup class."""
cls._debugger_client = MockDebuggerClient(backend='Ascend', graph_num=2)

@pytest.mark.level0
@pytest.mark.env_single
@pytest.mark.platform_x86_cpu
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_gpu_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.parametrize("body_data, expect_file", [
({'mode': 'all'}, 'multi_retrieve_all.json'),
({'mode': 'node', 'params': {'name': 'Default', 'graph_name': 'graph_1'}}, 'retrieve_scope_node.json'),
({'mode': 'node', 'params': {'name': 'graph_0'}}, 'multi_retrieve_scope_node.json'),
({'mode': 'node', 'params': {'name': 'graph_0/Default/optimizer-Momentum/Parameter[18]_7'}},
'multi_retrieve_aggregation_scope_node.json'),
({'mode': 'node', 'params': {
'name': 'graph_0/Default/TransData-op99',
'single_node': True}}, 'multi_retrieve_single_node.json'),
({'mode': 'node', 'params': {
'name': 'Default/TransData-op99',
'single_node': True, 'graph_name': 'graph_0'}}, 'retrieve_single_node.json')
])
def test_multi_retrieve_when_train_begin(self, app_client, body_data, expect_file):
"""Test retrieve when train_begin."""
url = 'retrieve'
with self._debugger_client.get_thread_instance():
check_waiting_state(app_client)
send_and_compare_result(app_client, url, body_data, expect_file)
send_terminate_cmd(app_client)


@pytest.mark.level0
@pytest.mark.env_single
@pytest.mark.platform_x86_cpu
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_gpu_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.parametrize("filter_condition, expect_file", [
({'name': '', 'node_category': 'weight'}, 'search_weight_multi_graph.json'),
({'node_category': 'activation'}, 'search_activation_multi_graph.json')
])
def test_search_by_category_with_multi_graph(self, app_client, filter_condition, expect_file):
"""Test search by category request."""
with self._debugger_client.get_thread_instance():
check_waiting_state(app_client)
send_and_compare_result(app_client, 'search', filter_condition, expect_file, method='get')
send_terminate_cmd(app_client)

@pytest.mark.level0
@pytest.mark.env_single
@pytest.mark.platform_x86_cpu
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_gpu_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.parametrize("filter_condition, expect_id", [
({'condition': {'id': 'inf'},
'watch_nodes': ['Default/optimizer-Momentum/Parameter[18]_7'],
'graph_name': 'graph_0'}, 1),
({'condition': {'id': 'inf'},
'watch_nodes': ['graph_0/Default/optimizer-Momentum/ApplyMomentum[8]_1'],
'graph_name': None}, 1)
])
def test_create_watchpoint(self, app_client, filter_condition, expect_id):
"""Test create watchpoint with multiple graphs."""
url = 'create_watchpoint'
with self._debugger_client.get_thread_instance():
check_waiting_state(app_client)
res = get_request_result(app_client, url, filter_condition)
assert res.get('id') == expect_id
send_terminate_cmd(app_client)
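
Both cases above address the same node: with multiple graphs, a watch node can be named either with an
explicit graph_name or by prefixing the graph as the first scope of the node name. A small sketch of that
naming convention (normalize_watch_node is hypothetical, inferred from these cases):

    def normalize_watch_node(node_name, graph_name=None):
        """Resolve a UI node name to (graph, node) under the multigraph naming rule."""
        if graph_name is None:
            # names like 'graph_0/Default/...' carry the graph as their first scope
            graph_name, _, node_name = node_name.partition('/')
        return graph_name, node_name

    assert normalize_watch_node('graph_0/Default/TransData-op99') == ('graph_0', 'Default/TransData-op99')
    assert normalize_watch_node('Default/TransData-op99', 'graph_0') == ('graph_0', 'Default/TransData-op99')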

@pytest.mark.level0
@pytest.mark.env_single
@pytest.mark.platform_x86_cpu
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_gpu_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.parametrize("params, expect_file", [
({'level': 'node'}, 'multi_next_node.json'),
({'level': 'node', 'node_name': 'graph_0/Default/TransData-op99'}, 'multi_next_node.json'),
({'level': 'node', 'node_name': 'Default/TransData-op99', 'graph_name': 'graph_0'},
'multi_next_node.json')
])
def test_continue_on_gpu(self, app_client, params, expect_file):
"""Test get next node on GPU."""
gpu_debugger_client = MockDebuggerClient(backend='GPU', graph_num=2)
original_value = settings.ENABLE_RECOMMENDED_WATCHPOINTS
settings.ENABLE_RECOMMENDED_WATCHPOINTS = True
try:
with gpu_debugger_client.get_thread_instance():
check_waiting_state(app_client)
# send run command to get watchpoint hit
url = 'control'
body_data = {'mode': 'continue'}
body_data.update(params)
res = get_request_result(app_client, url, body_data)
assert res == {'metadata': {'state': 'running', 'enable_recheck': False}}
# get metadata
check_waiting_state(app_client)
url = 'retrieve'
body_data = {'mode': 'all'}
send_and_compare_result(app_client, url, body_data, expect_file)
send_terminate_cmd(app_client)
finally:
settings.ENABLE_RECOMMENDED_WATCHPOINTS = original_value
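
The try/finally above restores ENABLE_RECOMMENDED_WATCHPOINTS by hand. With pytest the same cleanup can
come from the monkeypatch fixture, which undoes the setattr automatically when the test ends; a sketch,
assuming the settings object imported by this test module (not part of the suite):

    def test_continue_on_gpu_alt(self, app_client, monkeypatch):
        """Sketch: flip the flag for one test without an explicit finally block."""
        monkeypatch.setattr(settings, 'ENABLE_RECOMMENDED_WATCHPOINTS', True)
        gpu_debugger_client = MockDebuggerClient(backend='GPU', graph_num=2)
        with gpu_debugger_client.get_thread_instance():
            check_waiting_state(app_client)
            send_terminate_cmd(app_client)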

@pytest.mark.level0
@pytest.mark.env_single
@pytest.mark.platform_x86_cpu
@pytest.mark.platform_arm_ascend_training
@pytest.mark.platform_x86_gpu_training
@pytest.mark.platform_x86_ascend_training
@pytest.mark.parametrize("body_data, expect_file", [
({'tensor_name': 'Default/TransData-op99:0', 'graph_name': 'graph_0'}, 'retrieve_tensor_hits-0.json'),
({'tensor_name': 'Default/optimizer-Momentum/Parameter[18]_7/moments.fc1.bias:0', 'graph_name': 'graph_0'},
'retrieve_tensor_hits-1.json')
])
def test_retrieve_tensor_hits(self, app_client, body_data, expect_file):
"""Test retrieve tensor graph."""
url = 'tensor_hits'
with self._debugger_client.get_thread_instance():
check_waiting_state(app_client)
send_and_compare_result(app_client, url, body_data, expect_file, method='GET')
send_terminate_cmd(app_client)


def create_watchpoint(app_client, condition, expect_id):
"""Create watchpoint."""
url = 'create_watchpoint'
body_data = {'condition': condition,
'watch_nodes': ['Default/optimizer-Momentum/Parameter[18]_7',
'Default/optimizer-Momentum/Parameter[18]_7/moments.fc3.bias',
'Default/optimizer-Momentum/Parameter[18]_7/moments.fc1.bias',
'Default/TransData-op99']}
res = get_request_result(app_client, url, body_data)
assert res.get('id') == expect_id


def create_watchpoint_and_wait(app_client):
"""Preparation for recheck."""
check_waiting_state(app_client)
create_watchpoint(app_client, condition={'id': 'inf', 'params': []}, expect_id=1)
# send run command to get watchpoint hit
url = 'control'
body_data = {'mode': 'continue',
'steps': 2}
res = get_request_result(app_client, url, body_data)
assert res == {'metadata': {'state': 'running', 'enable_recheck': False}}
    # wait until the server has received the watchpoint hit
check_waiting_state(app_client)

+11 -6   tests/st/func/debugger/utils.py

@@ -27,19 +27,24 @@ def check_waiting_state(app_client):
     body_data = {'mode': 'all'}
     max_try_times = 30
     count = 0
+    flag = False
     while count < max_try_times:
         res = get_request_result(app_client, url, body_data)
         state = res.get('metadata', {}).get('state')
         if state == 'waiting':
-            return True
+            flag = True
+            break
         count += 1
         time.sleep(0.1)
-    return False
+    assert flag is True


-def get_request_result(app_client, url, body_data, method='post', expect_code=200):
+def get_request_result(app_client, url, body_data, method='post', expect_code=200, full_url=False):
     """Get request result."""
-    real_url = os.path.join(DEBUGGER_BASE_URL, url)
+    if not full_url:
+        real_url = os.path.join(DEBUGGER_BASE_URL, url)
+    else:
+        real_url = url
if method == 'post':
response = app_client.post(real_url, data=json.dumps(body_data))
else:
@@ -50,9 +55,9 @@ def get_request_result(app_client, url, body_data, method='post', expect_code=20
return res


-def send_and_compare_result(app_client, url, body_data, expect_file=None, method='post'):
+def send_and_compare_result(app_client, url, body_data, expect_file=None, method='post', full_url=False):
     """Send and compare result."""
-    res = get_request_result(app_client, url, body_data, method=method)
+    res = get_request_result(app_client, url, body_data, method=method, full_url=full_url)
delete_random_items(res)
if expect_file:
real_path = os.path.join(DEBUGGER_EXPECTED_RESULTS, 'restful_results', expect_file)
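
A hypothetical call that exercises the new full_url switch, mirroring test_get_conditions above (sketch
only; fetch_conditions is not part of the suite):

    def fetch_conditions(app_client):
        """Fetch the condition collection without the DEBUGGER_BASE_URL prefix."""
        url = '/v1/mindinsight/conditionmgr/train-jobs/train-id/conditions'
        # full_url=True skips the os.path.join with DEBUGGER_BASE_URL
        return get_request_result(app_client, url, {}, method='get', full_url=True)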


+12 -19  tests/ut/debugger/configurations.py

@@ -18,8 +18,6 @@ import os

from google.protobuf import json_format

-from mindinsight.datavisual.data_transform.graph import NodeTypeEnum
-from mindinsight.debugger.common.utils import NodeBasicInfo
from mindinsight.debugger.proto import ms_graph_pb2
from mindinsight.debugger.stream_handler.graph_handler import GraphHandler
from mindinsight.debugger.stream_handler.watchpoint_handler import WatchpointHitHandler
@@ -46,7 +44,7 @@ def init_graph_handler():
"""Init GraphHandler."""
graph = get_graph_proto()
graph_handler = GraphHandler()
-    graph_handler.put(graph)
+    graph_handler.put({graph.name: graph})

return graph_handler

@@ -64,16 +62,10 @@ def get_node_basic_infos(node_names):
if not node_names:
return []
graph_stream = init_graph_handler()
+    graph_name = graph_stream.graph_names[0]
     node_infos = []
     for node_name in node_names:
-        node_type = graph_stream.get_node_type(node_name)
-        if node_type == NodeTypeEnum.AGGREGATION_SCOPE.value:
-            sub_nodes = graph_stream.get_nodes_by_scope(node_name)
-            sub_infos = [NodeBasicInfo(name=node.name, full_name=node.full_name, type=node.type)
-                         for node in sub_nodes]
-            node_infos.extend(sub_infos)
-        full_name = graph_stream.get_full_name(node_name)
-        node_infos.append(NodeBasicInfo(name=node_name, full_name=full_name, type=node_type))
+        node_infos.append(graph_stream.get_node_basic_info(node_name, graph_name))
return node_infos


@@ -81,13 +73,10 @@ def get_watch_nodes_by_search(watch_nodes):
"""Get watched leaf nodes by search name."""
watched_leaf_nodes = []
graph_stream = init_graph_handler()
+    graph_name = graph_stream.graph_names[0]
     for search_name in watch_nodes:
-        search_nodes = graph_stream.get_searched_node_list()
-        search_node_names = [
-            NodeBasicInfo(name=node.name, full_name=node.full_name, type=node.type)
-            for node in search_nodes
-            if node.name.startswith(search_name)]
-        watched_leaf_nodes.extend(search_node_names)
+        search_node_info = graph_stream.get_node_basic_info_by_scope(search_name, graph_name)
+        watched_leaf_nodes.extend(search_node_info)

return watched_leaf_nodes

@@ -141,7 +130,7 @@ def mock_tensor_history():
return tensor_history


-def compare_debugger_result_with_file(res, expect_file):
+def compare_debugger_result_with_file(res, expect_file, save=False):
"""
Compare debugger result with file.

@@ -150,4 +139,8 @@ def compare_debugger_result_with_file(res, expect_file):
expect_file: The expected file name.
"""
real_path = os.path.join(DEBUGGER_EXPECTED_RESULTS, expect_file)
-    compare_result_with_file(res, real_path)
+    if save:
+        with open(real_path, 'w') as file_handler:
+            json.dump(res, file_handler)
+    else:
+        compare_result_with_file(res, real_path)
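
The new save switch turns the comparison helper into a fixture regenerator. A hypothetical one-off use
(remember to run the tests again with the default save=False so the assertion is actually enforced):

    result = {'graph': {'nodes': []}}  # placeholder for a handler's real output
    compare_debugger_result_with_file(result, 'graph/get_tensor_graph-0.json', save=True)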

+1 -1    tests/ut/debugger/expected_results/debugger_server/retrieve_all.json

@@ -1 +1 @@
{"metadata": {"state": "waiting", "step": 0, "device_name": "", "pos": "0", "ip": "", "node_name": "", "backend": ""}, "graph": {}, "watch_points": []}
{"metadata": {"state": "waiting", "step": 0, "device_name": "", "pos": "0", "ip": "", "node_name": "", "backend": "", "enable_recheck": false, "graph_name": ""}, "graph": {}, "watch_points": []}

+36 -1   tests/ut/debugger/expected_results/debugger_server/retrieve_tensor_history.json

@@ -1 +1,36 @@
{"tensor_history": [{"name": "Default/TransData-op99:0", "full_name": "Default/TransData-op99:0", "node_type": "TransData", "type": "output", "step": 0, "dtype": "DT_FLOAT32", "shape": [2, 3], "has_prev_step": false, "value": "click to view"}, {"name": "Default/args0:0", "full_name": "Default/args0:0", "node_type": "Parameter", "type": "input", "step": 0, "dtype": "DT_FLOAT32", "shape": [2, 3], "has_prev_step": false, "value": "click to view"}], "metadata": {"state": "waiting", "step": 0, "device_name": "", "pos": "0", "ip": "", "node_name": "", "backend": ""}}
{
"tensor_history": [
{
"name": "Default/TransData-op99:0",
"full_name": "Default/TransData-op99:0",
"node_type": "TransData",
"type": "output",
"step": 0,
"dtype": "DT_FLOAT32",
"shape": [
2,
3
],
"has_prev_step": false,
"value": "click to view"
},
{
"name": "Default/args0:0",
"full_name": "Default/args0:0",
"node_type": "Parameter",
"type": "input",
"step": 0,
"dtype": "DT_FLOAT32",
"shape": [
2,
3
],
"has_prev_step": false,
"value": "click to view"
}
],
"metadata": {
"state": "waiting",
"step": 0
}
}

+197 -0  tests/ut/debugger/expected_results/graph/get_tensor_graph-0.json

@@ -0,0 +1,197 @@
{
"graph": {
"nodes": [
{
"name": "Default/network-WithLossCell/_loss_fn-SoftmaxCrossEntropyWithLogits/args1",
"full_name": "Default/network-WithLossCell/_loss_fn-SoftmaxCrossEntropyWithLogits/args1",
"type": "Parameter",
"input": {},
"output": {
"Default/network-WithLossCell/_loss_fn-SoftmaxCrossEntropyWithLogits/OneHot-op0": {
"shape": [
[
32
]
],
"edge_type": "data",
"independent_layout": false,
"data_type": "DT_TENSOR[DT_INT32]",
"slot_mapping": [
[
"0",
""
]
]
}
},
"slots": [
{
"slot": "0"
}
]
},
{
"name": "Default/network-WithLossCell/_loss_fn-SoftmaxCrossEntropyWithLogits/cst1",
"full_name": "Default/network-WithLossCell/_loss_fn-SoftmaxCrossEntropyWithLogits/cst1",
"type": "Const",
"input": {},
"output": {
"Default/network-WithLossCell/_loss_fn-SoftmaxCrossEntropyWithLogits/OneHot-op0": {
"shape": [
[]
],
"edge_type": "data",
"independent_layout": false,
"data_type": "",
"slot_mapping": [
[
"0",
""
]
]
}
},
"slots": [
{
"slot": "0"
}
]
},
{
"name": "Default/network-WithLossCell/_loss_fn-SoftmaxCrossEntropyWithLogits/cst2",
"full_name": "Default/network-WithLossCell/_loss_fn-SoftmaxCrossEntropyWithLogits/cst2",
"type": "Const",
"input": {},
"output": {
"Default/network-WithLossCell/_loss_fn-SoftmaxCrossEntropyWithLogits/OneHot-op0": {
"shape": [
[]
],
"edge_type": "data",
"independent_layout": false,
"data_type": "",
"slot_mapping": [
[
"0",
""
]
]
}
},
"slots": [
{
"slot": "0"
}
]
},
{
"name": "Default/network-WithLossCell/_loss_fn-SoftmaxCrossEntropyWithLogits/SoftmaxCrossEntropyWithLogits-op18",
"full_name": "Default/network-WithLossCell/_loss_fn-SoftmaxCrossEntropyWithLogits/SoftmaxCrossEntropyWithLogits-op18",
"type": "SoftmaxCrossEntropyWithLogits",
"input": {
"Default/network-WithLossCell/_loss_fn-SoftmaxCrossEntropyWithLogits/OneHot-op0": {
"shape": [
[
32,
10
]
],
"edge_type": "data",
"independent_layout": false,
"data_type": "DT_TENSOR[DT_FLOAT32]",
"slot_mapping": [
[
"0",
""
]
]
}
},
"output": {},
"slots": [
{
"slot": "0"
},
{
"slot": "1"
}
]
},
{
"name": "Default/network-WithLossCell/_loss_fn-SoftmaxCrossEntropyWithLogits/OneHot-op0",
"full_name": "Default/network-WithLossCell/_loss_fn-SoftmaxCrossEntropyWithLogits/OneHot-op0",
"type": "OneHot",
"input": {
"Default/network-WithLossCell/_loss_fn-SoftmaxCrossEntropyWithLogits/args1": {
"shape": [
[
32
]
],
"edge_type": "data",
"independent_layout": false,
"data_type": "DT_TENSOR[DT_INT32]",
"slot_mapping": [
[
"0",
""
]
]
},
"Default/network-WithLossCell/_loss_fn-SoftmaxCrossEntropyWithLogits/cst1": {
"shape": [
[]
],
"edge_type": "data",
"independent_layout": false,
"data_type": "",
"slot_mapping": [
[
"0",
""
]
]
},
"Default/network-WithLossCell/_loss_fn-SoftmaxCrossEntropyWithLogits/cst2": {
"shape": [
[]
],
"edge_type": "data",
"independent_layout": false,
"data_type": "",
"slot_mapping": [
[
"0",
""
]
]
}
},
"output": {
"Default/network-WithLossCell/_loss_fn-SoftmaxCrossEntropyWithLogits/SoftmaxCrossEntropyWithLogits-op18": {
"shape": [
[
32,
10
]
],
"edge_type": "data",
"independent_layout": false,
"data_type": "DT_TENSOR[DT_FLOAT32]",
"slot_mapping": [
[
"0",
""
]
]
}
},
"slots": [
{
"slot": "0"
}
]
}
]
}
}

+176 -0  tests/ut/debugger/expected_results/graph/get_tensor_graph-1.json

@@ -0,0 +1,176 @@
{
"graph": {
"nodes": [
{
"name": "Default/network-WithLossCell/_backbone-LeNet5/conv2-Conv2d/Cast-op201",
"full_name": "Default/network-WithLossCell/_backbone-LeNet5/conv2-Conv2d/Cast-op201",
"type": "Cast",
"input": {},
"output": {
"Default/network-WithLossCell/_backbone-LeNet5/relu-ReLU/ReLUV2-op89": {
"shape": [
[
32,
16,
10,
10
]
],
"edge_type": "data",
"independent_layout": false,
"data_type": "DT_TENSOR[DT_FLOAT32]",
"slot_mapping": [
[
"0",
""
]
]
}
},
"slots": [
{
"slot": "0"
}
]
},
{
"name": "Default/tuple_getitem[10]_0/tuple_getitem-op203",
"full_name": "Default/tuple_getitem-op203",
"type": "tuple_getitem",
"input": {
"Default/network-WithLossCell/_backbone-LeNet5/relu-ReLU/ReLUV2-op89": {
"shape": [
[],
[]
],
"edge_type": "data",
"independent_layout": false,
"data_type": "DT_TUPLE['DT_TENSOR[DT_FLOAT32]', 'DT_TENSOR[DT_UINT8]']",
"slot_mapping": [
[
"0",
""
],
[
"1",
""
]
]
}
},
"output": {},
"slots": [
{
"slot": "0"
}
]
},
{
"name": "Default/tuple_getitem[10]_0/tuple_getitem-op202",
"full_name": "Default/tuple_getitem-op202",
"type": "tuple_getitem",
"input": {
"Default/network-WithLossCell/_backbone-LeNet5/relu-ReLU/ReLUV2-op89": {
"shape": [
[],
[]
],
"edge_type": "data",
"independent_layout": false,
"data_type": "DT_TUPLE['DT_TENSOR[DT_FLOAT32]', 'DT_TENSOR[DT_UINT8]']",
"slot_mapping": [
[
"0",
""
],
[
"1",
""
]
]
}
},
"output": {},
"slots": [
{
"slot": "0"
}
]
},
{
"name": "Default/network-WithLossCell/_backbone-LeNet5/relu-ReLU/ReLUV2-op89",
"full_name": "Default/network-WithLossCell/_backbone-LeNet5/relu-ReLU/ReLUV2-op89",
"type": "ReLUV2",
"input": {
"Default/network-WithLossCell/_backbone-LeNet5/conv2-Conv2d/Cast-op201": {
"shape": [
[
32,
16,
10,
10
]
],
"edge_type": "data",
"independent_layout": false,
"data_type": "DT_TENSOR[DT_FLOAT32]",
"slot_mapping": [
[
"0",
""
]
]
}
},
"output": {
"Default/tuple_getitem[10]_0/tuple_getitem-op203": {
"shape": [
[],
[]
],
"edge_type": "data",
"independent_layout": false,
"data_type": "DT_TUPLE['DT_TENSOR[DT_FLOAT32]', 'DT_TENSOR[DT_UINT8]']",
"slot_mapping": [
[
"0",
""
],
[
"1",
""
]
]
},
"Default/tuple_getitem[10]_0/tuple_getitem-op202": {
"shape": [
[],
[]
],
"edge_type": "data",
"independent_layout": false,
"data_type": "DT_TUPLE['DT_TENSOR[DT_FLOAT32]', 'DT_TENSOR[DT_UINT8]']",
"slot_mapping": [
[
"0",
""
],
[
"1",
""
]
]
}
},
"slots": [
{
"slot": "0"
},
{
"slot": "1"
}
]
}
]
}
}

+166 -0  tests/ut/debugger/expected_results/graph/get_tensor_graph-2.json

@@ -0,0 +1,166 @@
{
"graph": {
"nodes": [
{
"name": "Default/network-WithLossCell/_backbone-LeNet5/max_pool2d-MaxPool2d/MaxPoolWithArgmax-op7",
"full_name": "Default/network-WithLossCell/_backbone-LeNet5/max_pool2d-MaxPool2d/MaxPoolWithArgmax-op7",
"type": "MaxPoolWithArgmax",
"input": {},
"output": {
"Default/tuple_getitem[10]_0/tuple_getitem-op206": {
"shape": [
[],
[]
],
"edge_type": "data",
"independent_layout": false,
"data_type": "DT_TUPLE['DT_TENSOR[DT_FLOAT32]', 'DT_TENSOR[DT_UINT16]']",
"slot_mapping": [
[
"0",
""
],
[
"1",
""
]
]
}
},
"slots": [
{
"slot": "0"
},
{
"slot": "1"
}
]
},
{
"name": "Default/tuple_getitem[10]_0/cst28",
"full_name": "Default/tuple_getitem[10]_0/cst28",
"type": "Const",
"input": {},
"output": {
"Default/tuple_getitem[10]_0/tuple_getitem-op206": {
"shape": [
[]
],
"edge_type": "data",
"independent_layout": false,
"data_type": "",
"slot_mapping": [
[
"0",
""
]
]
}
},
"slots": [
{
"slot": "0"
}
]
},
{
"name": "Gradients/Default/network-WithLossCell/_backbone-LeNet5/max_pool2d-MaxPool2d/gradMaxPoolWithArgmax/MaxPoolGradWithArgmax-op46",
"full_name": "Gradients/Default/network-WithLossCell/_backbone-LeNet5/max_pool2d-MaxPool2d/gradMaxPoolWithArgmax/MaxPoolGradWithArgmax-op46",
"type": "MaxPoolGradWithArgmax",
"input": {
"Default/tuple_getitem[10]_0/tuple_getitem-op206": {
"shape": [
[
32,
16,
4,
3
]
],
"edge_type": "data",
"independent_layout": false,
"data_type": "DT_TENSOR[DT_UINT16]",
"slot_mapping": [
[
"0",
""
]
]
}
},
"output": {},
"slots": [
{
"slot": "0"
}
]
},
{
"name": "Default/tuple_getitem[10]_0/tuple_getitem-op206",
"full_name": "Default/tuple_getitem-op206",
"type": "tuple_getitem",
"input": {
"Default/network-WithLossCell/_backbone-LeNet5/max_pool2d-MaxPool2d/MaxPoolWithArgmax-op7": {
"shape": [
[],
[]
],
"edge_type": "data",
"independent_layout": false,
"data_type": "DT_TUPLE['DT_TENSOR[DT_FLOAT32]', 'DT_TENSOR[DT_UINT16]']",
"slot_mapping": [
[
"0",
""
],
[
"1",
""
]
]
},
"Default/tuple_getitem[10]_0/cst28": {
"shape": [
[]
],
"edge_type": "data",
"independent_layout": false,
"data_type": "",
"slot_mapping": [
[
"0",
""
]
]
}
},
"output": {
"Gradients/Default/network-WithLossCell/_backbone-LeNet5/max_pool2d-MaxPool2d/gradMaxPoolWithArgmax/MaxPoolGradWithArgmax-op46": {
"shape": [
[
32,
16,
4,
3
]
],
"edge_type": "data",
"independent_layout": false,
"data_type": "DT_TENSOR[DT_UINT16]",
"slot_mapping": [
[
"0",
""
]
]
}
},
"slots": [
{
"slot": "0"
}
]
}
]
}
}
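
The three get_tensor_graph fixtures above share one shape: a node list in which every node names its data
edges with shape, data_type and a slot_mapping. A small sanity check one could run over such a fixture
(check_tensor_graph is hypothetical, not part of the suite):

    import json

    def check_tensor_graph(path):
        """Every edge endpoint referenced by a node must itself be a node."""
        with open(path) as file_handler:
            nodes = json.load(file_handler)['graph']['nodes']
        names = {node['name'] for node in nodes}
        for node in nodes:
            for direction in ('input', 'output'):
                for peer in node[direction]:
                    assert peer in names, '%s referenced but not present' % peer

    check_tensor_graph('get_tensor_graph-0.json')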

+1 -1    tests/ut/debugger/expected_results/graph/graph_handler_get_1_no_filter_condintion.json
(File diff suppressed because it is too large.)


+1 -0    tests/ut/debugger/expected_results/graph/search_nodes_by_type_0.json

@@ -0,0 +1 @@
{"node_names": ["Default/network-WithLossCell/_backbone-LeNet5/conv2-Conv2d/conv2.weight", "Default/network-WithLossCell/_backbone-LeNet5/conv1-Conv2d/conv1.weight", "Default/network-WithLossCell/_backbone-LeNet5/fc3-Dense/Parameter[6]_6/fc1.weight", "Default/network-WithLossCell/_backbone-LeNet5/fc3-Dense/Parameter[6]_6/fc1.bias", "Default/network-WithLossCell/_backbone-LeNet5/fc3-Dense/Parameter[6]_6/fc2.weight", "Default/network-WithLossCell/_backbone-LeNet5/fc3-Dense/Parameter[6]_6/fc2.bias", "Default/network-WithLossCell/_backbone-LeNet5/fc3-Dense/Parameter[6]_6/fc3.weight", "Default/network-WithLossCell/_backbone-LeNet5/fc3-Dense/Parameter[6]_6/fc3.bias"]}

+1 -0    tests/ut/debugger/expected_results/graph/search_nodes_by_type_1.json

@@ -0,0 +1 @@
{"node_names": ["Default/network-WithLossCell/_backbone-LeNet5/relu-ReLU/ReLU-op12", "Default/network-WithLossCell/_backbone-LeNet5/relu-ReLU/ReLU-op15"]}

+0 -1    tests/ut/debugger/expected_results/graph/tenor_hist_0.json

@@ -1 +0,0 @@
{"tensor_history": [{"name": "Default/network-WithLossCell/_backbone-LeNet5/conv1-Conv2d/Cast-op190:0", "full_name": "Default/network-WithLossCell/_backbone-LeNet5/conv1-Conv2d/Cast-op190:0", "node_type": "Cast", "type": "output"}, {"name": "Default/TransData-op99:0", "full_name": "Default/TransData-op99:0", "node_type": "TransData", "type": "input"}]}

+18 -0   tests/ut/debugger/expected_results/graph/tensor_hist_0.json

@@ -0,0 +1,18 @@
{
"tensor_history": [
{
"name": "Default/network-WithLossCell/_backbone-LeNet5/conv1-Conv2d/Cast-op190:0",
"full_name": "Default/network-WithLossCell/_backbone-LeNet5/conv1-Conv2d/Cast-op190:0",
"node_type": "Cast",
"type": "output",
"graph_name": "kernel_graph_0"
},
{
"name": "Default/TransData-op99:0",
"full_name": "Default/TransData-op99:0",
"node_type": "TransData",
"type": "input",
"graph_name": "kernel_graph_0"
}
]
}

+53 -1   tests/ut/debugger/expected_results/graph/tensor_hist_1.json

@@ -1 +1,53 @@
{"tensor_history": [{"name": "Default/optimizer-Momentum/ApplyMomentum[8]_1/ApplyMomentum-op22:0", "full_name": "Default/optimizer-Momentum/ApplyMomentum-op22:0", "node_type": "ApplyMomentum", "type": "output"}, {"name": "Default/optimizer-Momentum/ApplyMomentum[8]_1/ApplyMomentum-op22:1", "full_name": "Default/optimizer-Momentum/ApplyMomentum-op22:1", "node_type": "ApplyMomentum", "type": "output"}, {"name": "Gradients/Default/network-WithLossCell/_backbone-LeNet5/fc3-Dense/gradBiasAdd/BiasAddGrad-op21:0", "full_name": "Gradients/Default/network-WithLossCell/_backbone-LeNet5/fc3-Dense/gradBiasAdd/BiasAddGrad-op21:0", "node_type": "BiasAddGrad", "type": "input"}, {"name": "Default/optimizer-Momentum/Parameter[18]_7/fc3.bias:0", "full_name": "Default/optimizer-Momentum/Parameter[18]_7/fc3.bias:0", "node_type": "Parameter", "type": "input"}, {"name": "Default/optimizer-Momentum/Parameter[18]_7/moments.fc3.bias:0", "full_name": "Default/optimizer-Momentum/Parameter[18]_7/moments.fc3.bias:0", "node_type": "Parameter", "type": "input"}, {"name": "Default/optimizer-Momentum/Parameter[18]_7/learning_rate:0", "full_name": "Default/optimizer-Momentum/Parameter[18]_7/learning_rate:0", "node_type": "Parameter", "type": "input"}, {"name": "Default/optimizer-Momentum/Parameter[18]_7/momentum:0", "full_name": "Default/optimizer-Momentum/Parameter[18]_7/momentum:0", "node_type": "Parameter", "type": "input"}]}
{
"tensor_history": [
{
"name": "Default/optimizer-Momentum/ApplyMomentum[8]_1/ApplyMomentum-op22:0",
"full_name": "Default/optimizer-Momentum/ApplyMomentum-op22:0",
"node_type": "ApplyMomentum",
"type": "output",
"graph_name": "kernel_graph_0"
},
{
"name": "Default/optimizer-Momentum/ApplyMomentum[8]_1/ApplyMomentum-op22:1",
"full_name": "Default/optimizer-Momentum/ApplyMomentum-op22:1",
"node_type": "ApplyMomentum",
"type": "output",
"graph_name": "kernel_graph_0"
},
{
"name": "Gradients/Default/network-WithLossCell/_backbone-LeNet5/fc3-Dense/gradBiasAdd/BiasAddGrad-op21:0",
"full_name": "Gradients/Default/network-WithLossCell/_backbone-LeNet5/fc3-Dense/gradBiasAdd/BiasAddGrad-op21:0",
"node_type": "BiasAddGrad",
"type": "input",
"graph_name": "kernel_graph_0"
},
{
"name": "Default/optimizer-Momentum/Parameter[18]_7/fc3.bias:0",
"full_name": "Default/optimizer-Momentum/Parameter[18]_7/fc3.bias:0",
"node_type": "Parameter",
"type": "input",
"graph_name": "kernel_graph_0"
},
{
"name": "Default/optimizer-Momentum/Parameter[18]_7/moments.fc3.bias:0",
"full_name": "Default/optimizer-Momentum/Parameter[18]_7/moments.fc3.bias:0",
"node_type": "Parameter",
"type": "input",
"graph_name": "kernel_graph_0"
},
{
"name": "Default/optimizer-Momentum/Parameter[18]_7/learning_rate:0",
"full_name": "Default/optimizer-Momentum/Parameter[18]_7/learning_rate:0",
"node_type": "Parameter",
"type": "input",
"graph_name": "kernel_graph_0"
},
{
"name": "Default/optimizer-Momentum/Parameter[18]_7/momentum:0",
"full_name": "Default/optimizer-Momentum/Parameter[18]_7/momentum:0",
"node_type": "Parameter",
"type": "input",
"graph_name": "kernel_graph_0"
}
]
}

+6 -14   tests/ut/debugger/expected_results/watchpoint/watchpoint_handler_get_0.json

@@ -3,31 +3,23 @@
"watchCondition": {
"condition": "inf"
},
"id": 1
"id": 1,
"watch_nodes_num": 0
},
{
"watchCondition": {
"condition": "inf"
},
"id": 2,
"watchNodes": [
{
"nodeName": "Default",
"nodeType": "scope"
}
]
"watch_nodes_num": 172
},
{
"watchCondition": {
"condition": "max_gt",
"value": 1.0
"params": [{"name": "param", "value": 1}],
"value": 1
},
"id": 3,
"watchNodes": [
{
"nodeName": "Gradients/Default/network-WithLossCell/_backbone-LeNet5/relu-ReLU/gradReLU/ReluGradV2-op92",
"nodeType": "leaf"
}
]
"watch_nodes_num": 1
}
]

+1 -1    tests/ut/debugger/expected_results/watchpoint/watchpoint_handler_get_1.json

@@ -1 +1 @@
[{"id": 1, "watch_condition": {"condition": "INF"}}, {"id": 2, "watch_condition": {"condition": "INF"}}, {"id": 3, "watch_condition": {"condition": "MAX_GT", "param": 1}}]
[{"id": 1, "watch_condition": {"id": "inf", "params": [], "abbr": "INF"}}, {"id": 2, "watch_condition": {"id": "inf", "params": [], "abbr": "INF"}}, {"id": 3, "watch_condition": {"id": "max_gt", "params": [{"name": "param", "value": 1, "disable": false}], "abbr": "MAX>"}}]

+22 -1   tests/ut/debugger/expected_results/watchpoint/watchpoint_hit_handler_get_0.json

@@ -1 +1,22 @@
{"watch_point_hits": [{"node_name": "Gradients/Default/network-WithLossCell/_backbone-LeNet5/relu-ReLU/gradReLU/ReluGradV2-op92", "watch_points": [{"id": 1, "watch_condition": {"condition": "MAX_GT", "param": 1}}]}]}
{
"watch_point_hits": [
{
"node_name": "Gradients/Default/network-WithLossCell/_backbone-LeNet5/relu-ReLU/gradReLU/ReluGradV2-op92",
"tensors": [
{
"slot": "0",
"watch_points": [
{
"id": 1,
"watch_condition": {
"condition": "MAX_GT",
"param": 1
}
}
]
}
],
"graph_name": "kernel_graph_0"
}
]
}
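
Compared with the old flat layout, hits are now grouped per tensor slot and tagged with their graph. A
sketch of walking the new structure (file name relative to the expected-results directory):

    import json

    with open('watchpoint_hit_handler_get_0.json') as file_handler:
        reply = json.load(file_handler)
    for hit in reply['watch_point_hits']:
        for tensor in hit['tensors']:
            for point in tensor['watch_points']:
                # one line per (node, slot, watchpoint) triple
                print(hit.get('graph_name', ''), hit['node_name'], tensor['slot'], point['id'])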

+15 -0   tests/ut/debugger/stream_cache/__init__.py

@@ -0,0 +1,15 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Test for debugger stream cache."""

+77 -0   tests/ut/debugger/stream_cache/test_node_type_identifier.py

@@ -0,0 +1,77 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""
Function:
Test query debugger node type identifier.
Usage:
pytest tests/ut/debugger
"""
from unittest.mock import MagicMock

import pytest

from mindinsight.debugger.common.exceptions.exceptions import DebuggerParamValueError
from mindinsight.debugger.stream_cache.node_type_identifier import NodeTypeIdentifier


class TestNodeTypeIdentifier:
"""Test NodeTypeIdentifier."""

@pytest.mark.parametrize("name, node_type, result", [
('Default/mock/node_name.bias', "Parameter", True),
('Default/mock/node_name.weight', "Parameter", True),
('Gradients/mock/node_name.bias', "Parameter", False),
('Default/optimizer-mock/node_name.bias', "Parameter", False),
])
def test_weight_node(self, name, node_type, result):
"""Test weight node."""
identifier = NodeTypeIdentifier('weight')
mock_node = MagicMock(type=node_type)
mock_node.name = name
res = identifier.is_match(mock_node)
assert res is result

@pytest.mark.parametrize("name, node_type, result", [
('Default/mock/node_name.bias', "Parameter", False),
('Gradients/mock/node_name.bias', "Parameter", False),
('Gradients/mock-mock/node_name.bias', "ReluGrad", True),
])
def test_gradient_node(self, name, node_type, result):
"""Test gradient node."""
identifier = NodeTypeIdentifier('gradient')
mock_node = MagicMock(type=node_type)
mock_node.name = name
res = identifier.is_match(mock_node)
assert res is result

@pytest.mark.parametrize("name, node_type, condition, result", [
('Default/mock/relu_ReLU-op11', "ReLU", None, True),
('Gradients/mock/relu_ReLU-op11', "ReLU", None, False),
('Default/mock/relu_ReLU-op11', "Parameter", None, False),
('Default/mock/relu_ReLU-op11', "ReLU", {'activation_func': 'Softmax'}, False),
('Default/mock/relu_ReLU-op11', "Softmax", {'activation_func': ['ReLU', 'Softmax']}, True)
])
def test_activate_node(self, name, node_type, condition, result):
"""Test activate node."""
identifier = NodeTypeIdentifier('activation')
mock_node = MagicMock(type=node_type)
mock_node.name = name
res = identifier.is_match(mock_node, condition)
assert res is result

def test_invalid_func(self):
"""Test invalid func."""
with pytest.raises(DebuggerParamValueError, match='Invalid identify type.'):
NodeTypeIdentifier('invalid_type')
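
Read together, the parametrized cases pin down the matching rules. For the 'weight' identifier they imply
roughly the following predicate (an inference from the tests above, not the real NodeTypeIdentifier code):

    from collections import namedtuple

    Node = namedtuple('Node', ['name', 'type'])

    def looks_like_weight(node):
        """A Parameter named *.weight or *.bias, outside Gradients and optimizer scopes."""
        return (node.type == 'Parameter'
                and node.name.split('/')[-1].endswith(('.weight', '.bias'))
                and not node.name.startswith('Gradients')
                and 'optimizer-' not in node.name)

    assert looks_like_weight(Node('Default/mock/node_name.bias', 'Parameter'))
    assert not looks_like_weight(Node('Default/optimizer-mock/node_name.bias', 'Parameter'))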

+27 -4   tests/ut/debugger/stream_handler/test_graph_handler.py

@@ -22,7 +22,7 @@ import os

import pytest

-from tests.ut.debugger.configurations import init_graph_handler
+from tests.ut.debugger.configurations import init_graph_handler, compare_debugger_result_with_file
from tests.utils.tools import compare_result_with_file


@@ -46,11 +46,12 @@ class TestGraphHandler:
"""Test get."""
result = self.graph_handler.get(filter_condition)
file_path = os.path.join(self.graph_results_dir, result_file)
+        compare_debugger_result_with_file(result, file_path, True)
compare_result_with_file(result, file_path)

@pytest.mark.parametrize("node_name, result_file", [
("Default/network-WithLossCell/_backbone-LeNet5/conv1-Conv2d/Cast-op190",
"tenor_hist_0.json"),
"tensor_hist_0.json"),
("Default/optimizer-Momentum/ApplyMomentum[8]_1/ApplyMomentum-op22",
"tensor_hist_1.json")
])
@@ -66,10 +67,22 @@ class TestGraphHandler:
])
def test_search_nodes(self, pattern, result_file):
"""Test search nodes."""
-        result = self.graph_handler.search_nodes(pattern)
+        result = self.graph_handler.search_nodes({'name': pattern})
file_path = os.path.join(self.graph_results_dir, result_file)
compare_result_with_file(result, file_path)

@pytest.mark.parametrize("node_type, condition, result_file", [
("weight", None, "search_nodes_by_type_0.json"),
("activation", {'activation_func': ['ReLU', 'Softmax']}, "search_nodes_by_type_1.json")
])
def test_search_nodes_by_type(self, node_type, condition, result_file):
"""Test search nodes by type."""
search_nodes = self.graph_handler.get_searched_node_list(
{'node_category': node_type, 'condition': condition}, 'kernel_graph_0')
file_path = os.path.join(self.graph_results_dir, result_file)
result = {'node_names': [node.name for node in search_nodes]}
compare_result_with_file(result, file_path)

@pytest.mark.parametrize("node_name, expect_type", [
("Default/network-WithLossCell/_loss_fn-SoftmaxCrossEntropyWithLogits/cst1", 'Const'),
("Default/TransData-op99", "TransData")
@@ -96,7 +109,7 @@ class TestGraphHandler:
])
def test_get_node_name_by_full_name(self, full_name, expect_node_name):
"""Test get node name by full name."""
-        node_name = self.graph_handler.get_node_name_by_full_name(full_name)
+        node_name = self.graph_handler.get_node_name_by_full_name(full_name, 'kernel_graph_0')
assert node_name == expect_node_name

@pytest.mark.parametrize("node_name, ascend, expect_next", [
@@ -112,3 +125,13 @@ class TestGraphHandler:
"""Test get node by BFS order."""
next_node = self.graph_handler.get_node_by_bfs_order(node_name, ascend)
assert next_node == expect_next

@pytest.mark.parametrize("tensor_name, expect_file", [
("Default/network-WithLossCell/_loss_fn-SoftmaxCrossEntropyWithLogits/OneHot-op0:0", "get_tensor_graph-0.json"),
("Default/network-WithLossCell/_backbone-LeNet5/relu-ReLU/ReLUV2-op89:1", "get_tensor_graph-1.json"),
("Default/tuple_getitem[10]_0/tuple_getitem-op206:1", "get_tensor_graph-2.json"),
])
def test_get_tensor_graph(self, tensor_name, expect_file):
"""Test get tensor graph."""
res = self.graph_handler.get_tensor_graph(tensor_name, None)
compare_debugger_result_with_file(res, expect_file=os.path.join('graph', expect_file))

+1 -30   tests/ut/debugger/stream_handler/test_tensor_handler.py

@@ -14,11 +14,10 @@
# ============================================================================
"""Test tensor_handler.py"""
from unittest import mock
-from unittest.mock import MagicMock
import pytest

from mindinsight.debugger.common.exceptions.exceptions import DebuggerParamValueError
-from mindinsight.debugger.common.log import logger as log
+from mindinsight.debugger.common.log import LOGGER as log
from mindinsight.debugger.stream_handler.tensor_handler import TensorHandler


@@ -41,34 +40,6 @@ class TestTensorHandler:
self.tensor_handler.get(filter_condition)
assert "No tensor named {}".format(None) in str(ex.value)

-    @mock.patch.object(TensorHandler, '_get_prev_tensor_value_status')
-    @pytest.mark.parametrize(
-        "node_type, tensor_name, tensor_info", [('Parameter', 'name', {'full_name': 'name', 'step': 1})])
-    def test_update_has_prev_step_field(self, mock_get_pre, node_type, tensor_name, tensor_info):
-        """Test update has_prev_step field in tensor info."""
-        mock_get_pre.return_value = True
-        res = self.tensor_handler._update_has_prev_step_field(tensor_info, tensor_name, node_type)
-        assert res
-
-    def test_get_prev_tensor_value_status_none(self):
-        """
-        test _get_prev_tensor_value_status.
-        """
-        res = self.tensor_handler._get_prev_tensor_value_status('tensor_name')
-        assert res is None
-
-    @mock.patch.object(TensorHandler, '_get_tensor')
-    def test_get_prev_tensor_value_status_false(self, mock_get_tensor):
-        """
-        test _get_prev_tensor_value_status.
-        """
-        self.tensor_handler._cur_step = 1
-        mock_tensor = MagicMock()
-        mock_tensor.value = None
-        mock_get_tensor.return_value = mock_tensor
-        res = self.tensor_handler._get_prev_tensor_value_status('tensor_name')
-        assert not res

def test_get_tensor_value_by_name_none(self):
"""Test get_tensor_value_by_name."""
res = self.tensor_handler.get_tensor_value_by_name('tensor_name', True)


+67 -43  tests/ut/debugger/stream_handler/test_watchpoint_handler.py

@@ -22,47 +22,56 @@ import json
import os
from unittest import mock, TestCase

-from google.protobuf import json_format
 import pytest
+from google.protobuf import json_format

+from mindinsight.conditionmgr.conditionmgr import ConditionMgr
from mindinsight.debugger.common.exceptions.exceptions import DebuggerParamValueError, \
DebuggerParamTypeError
-from mindinsight.debugger.common.log import logger as log
+from mindinsight.debugger.common.log import LOGGER as log
from mindinsight.debugger.stream_cache.watchpoint import Watchpoint
from mindinsight.debugger.stream_handler.watchpoint_handler import WatchpointHandler, \
WatchpointHitHandler, validate_watch_condition, validate_watch_condition_params

from tests.ut.debugger.configurations import init_graph_handler, mock_tensor_proto, \
-    mock_tensor_history, get_node_basic_infos, get_watch_nodes_by_search, \
+    mock_tensor_history, get_node_basic_infos, \
init_watchpoint_hit_handler
from tests.utils.tools import compare_result_with_file


class TestWatchpointHandler:
"""Test WatchpointHandler."""

@classmethod
def setup_class(cls):
"""Init WatchpointHandler for watchpoint unittest."""
-        cls.handler = WatchpointHandler()
cls.results_dir = os.path.join(os.path.dirname(__file__),
'../expected_results/watchpoint')
cls.graph_results_dir = os.path.join(os.path.dirname(__file__),
'../expected_results/graph')
cls.graph_stream = init_graph_handler()
+        cls.conditionmgr = None
+        cls.handler = None

-    @pytest.mark.parametrize(
-        "watch_condition, watch_nodes, watch_point_id, expect_new_id", [
-            ({'condition': 'INF'}, None, None, 1),
-            ({'condition': 'INF'}, ["Default"], None, 2),
-            ({'condition': 'MAX_GT', 'param': 1},
-             ["Gradients/Default/network-WithLossCell/_backbone-LeNet5/relu-ReLU/gradReLU/ReluGradV2-op92"], None, 3)
-        ])
-    def test_create_watchpoint(self, watch_condition, watch_nodes,
-                               watch_point_id, expect_new_id):
+    def setup_method(self):
+        """Init watchpoint for each unittest."""
+        self.conditionmgr = ConditionMgr()
+        self.handler = WatchpointHandler()
+        self._create_watchpoint()
+
+    def _create_watchpoint(self):
         """Test create_watchpoint."""
-        watch_nodes = get_node_basic_infos(watch_nodes)
-        watch_point_id = self.handler.create_watchpoint(watch_condition, watch_nodes, watch_point_id)
-        assert watch_point_id == expect_new_id
+        watchpoints = [
+            ({'id': 'inf', 'params': []}, None, None, 1),
+            ({'id': 'inf', 'params': []}, ["Default"], None, 2),
+            ({'id': 'max_gt', 'params': [{'name': 'param', 'value': 1, 'disable': False}]},
+             ["Gradients/Default/network-WithLossCell/_backbone-LeNet5/relu-ReLU/gradReLU/ReluGradV2-op92"],
+             None, 3)
+        ]
+        for watch_condition, watch_nodes, watch_point_id, expect_new_id in watchpoints:
+            watch_nodes = get_node_basic_infos(watch_nodes)
+            watch_point_id = self.handler.create_watchpoint(self.conditionmgr, watch_condition, watch_nodes,
+                                                            watch_point_id)
+            assert watch_point_id == expect_new_id

@pytest.mark.parametrize(
"watch_point_id, watch_nodes, watched, expect_updated_id", [
@@ -84,31 +93,28 @@ class TestWatchpointHandler:
])
def test_update_watchpoint_delete(self, watch_point_id, watch_nodes, watched, expect_updated_id):
"""Test update_watchpoint on deletion."""
-        watch_nodes = get_watch_nodes_by_search(watch_nodes)
+        watch_nodes = get_node_basic_infos(watch_nodes)
with TestCase().assertLogs(logger=log, level='DEBUG') as log_content:
self.handler.update_watchpoint(watch_point_id, watch_nodes, watched)
TestCase().assertIn(f"DEBUG:debugger.debugger:Update watchpoint {expect_updated_id} in cache.",
log_content.output)

@pytest.mark.parametrize("filter_condition, result_file", [
(True, 'watchpoint_handler_get_0.json')
])
def test_get_filter_true(self, filter_condition, result_file):
def test_get_pending_commands(self):
"""Test get with filter_condition is True."""
result_file = 'watchpoint_handler_get_0.json'
file_path = os.path.join(self.results_dir, result_file)
-        with open(file_path, 'r') as f:
-            contents = json.load(f)
-
-        reply = self.handler.get(filter_condition)
-        protos = reply.get('watch_points')
+        with open(file_path, 'r') as file_handler:
+            contents = json.load(file_handler)
+        protos = self.handler.get_pending_commands(self.graph_stream)
for proto in protos:
msg_dict = json_format.MessageToDict(proto)
+            msg_dict['watch_nodes_num'] = len(msg_dict.pop('watchNodes', []))
assert msg_dict in contents

@pytest.mark.parametrize("filter_condition, result_file", [
-        (False, 'watchpoint_handler_get_1.json')
+        (None, 'watchpoint_handler_get_1.json')
])
-    def test_get_filter_false(self, filter_condition, result_file):
+    def test_get_without_filter(self, filter_condition, result_file):
"""Test get with filer_condition is False."""
file_path = os.path.join(self.results_dir, result_file)
reply = self.handler.get(filter_condition)
@@ -121,7 +127,7 @@ class TestWatchpointHandler:
with pytest.raises(DebuggerParamValueError) as err:
self.handler.get_watchpoint_by_id(watchpoint_id)
assert err.value.error_code == '5054B081'
assert err.value.message == f"ValueError. Invalid watchpoint id {watchpoint_id}"
assert err.value.message == f"ValueError. Invalid watchpoint id: {watchpoint_id}"

@pytest.mark.parametrize("graph_file, watch_point_id", [
('graph_handler_get_3_single_node.json', 4)
@@ -129,20 +135,37 @@ class TestWatchpointHandler:
def test_set_watch_nodes(self, graph_file, watch_point_id):
"""Test set_watch_nodes."""
path = os.path.join(self.graph_results_dir, graph_file)
-        with open(path, 'r') as f:
-            graph = json.load(f)
+        with open(path, 'r') as file_handler:
+            graph = json.load(file_handler)
self.handler.set_watch_nodes(graph, self.graph_stream, watch_point_id)

@pytest.mark.parametrize(
"watch_point_id, expect_deleted_ids", [
(3, 3), (2, 2)
(3, 3), (None, 2)
])
def test_delete_watchpoint(self, watch_point_id, expect_deleted_ids):
"""Test delete_watchpoint."""
self.handler.sync_set_cmd({})
with TestCase().assertLogs(logger=log, level='DEBUG') as log_content:
self.handler.delete_watchpoint(watch_point_id)
TestCase().assertIn(f"DEBUG:debugger.debugger:Delete watchpoint {expect_deleted_ids} in cache.",
log_content.output)
TestCase().assertIn(
f"DEBUG:debugger.debugger:Delete watchpoint {expect_deleted_ids} in cache.",
log_content.output)

@pytest.mark.parametrize(
"watch_point_id, expect_deleted_ids", [
(3, 3), (2, 2)
])
def test_delete_watchpoint_in_cache(self, watch_point_id,
expect_deleted_ids):
"""Test delete_watchpoint."""
for _ in range(watch_point_id):
self.handler.create_watchpoint(self.conditionmgr, {'id': 'inf', 'param': []})
with TestCase().assertLogs(logger=log, level='DEBUG') as log_content:
self.handler.delete_watchpoint(watch_point_id)
TestCase().assertIn(
f"DEBUG:debugger.debugger:Cancel create watchpoint {expect_deleted_ids} in cache.",
log_content.output)


class TestWatchpointHitHandler:
@@ -155,8 +178,7 @@ class TestWatchpointHitHandler:
'tensor_proto': mock_tensor_proto(),
'watchpoint': watchpoint,
'node_name': 'Gradients/Default/network-WithLossCell/_backbone-LeNet5/relu-ReLU/gradReLU/ReluGradV2-op92',
-        'finished': True,
-        'slot': 0
+        'graph_name': 'kernel_graph_0',
}

@classmethod
@@ -198,24 +220,26 @@ class TestWatchpointHitHandler:
def test_validate_watch_condition_type_error():
"""Test validate_watch_condition."""
watch_condition = []
conditionmgr = ConditionMgr()
with pytest.raises(DebuggerParamTypeError) as err:
-        validate_watch_condition(watch_condition)
+        validate_watch_condition(conditionmgr, watch_condition)
assert err.value.error_code == '5054B080'

watch_condition = {'watch_condition': {'condition': 'MAXIMUM'}}
with pytest.raises(DebuggerParamValueError) as err:
-        validate_watch_condition(watch_condition)
+        validate_watch_condition(conditionmgr, watch_condition)
assert err.value.error_code == '5054B081'


def test_validate_watch_condition_params_except():
"""Test validate_watch_condition_params."""
-    watch_condition = {'watch_condition': {'condition': 'NAN', 'param': 1}}
+    watch_condition = {'id': 'inf', 'params': [{'name': 'param', 'value': 0, 'disable': False}]}
conditionmgr = ConditionMgr()
with pytest.raises(DebuggerParamValueError) as err:
-        validate_watch_condition_params(watch_condition)
+        validate_watch_condition_params(conditionmgr, watch_condition)
assert err.value.error_code == '5054B081'

-    watch_condition = {'watch_condition': {'condition': 'MAX_GT', 'param': '0'}}
+    watch_condition = {'id': 'max_gt', 'params': [{'name': 'param', 'value': '0', 'disable': False}]}
with pytest.raises(DebuggerParamValueError) as err:
-        validate_watch_condition_params(watch_condition)
+        validate_watch_condition_params(conditionmgr, watch_condition)
assert err.value.error_code == '5054B081'
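
The payload change running through this file, side by side (a hedged reconstruction of the two formats as
plain data literals, for orientation only):

    # old-style condition: a name plus an optional scalar param
    old_condition = {'condition': 'MAX_GT', 'param': 1}
    # new-style condition: an id plus a named, disable-able parameter list
    new_condition = {'id': 'max_gt',
                     'params': [{'name': 'param', 'value': 1, 'disable': False}]}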

+8 -5    tests/ut/debugger/test_debugger_grpc_server.py

@@ -23,6 +23,7 @@ from unittest.mock import MagicMock

import numpy as np

from mindinsight.conditionmgr.conditionmgr import ConditionMgr
from mindinsight.debugger.common.utils import get_ack_reply, ServerStatus
from mindinsight.debugger.debugger_cache import DebuggerCache
from mindinsight.debugger.debugger_grpc_server import DebuggerGrpcServer
@@ -117,7 +118,7 @@ class TestDebuggerGrpcServer:
def setup_method(self):
"""Initialize for each testcase."""
cache_store = DebuggerCache()
-        self._server = DebuggerGrpcServer(cache_store)
+        self._server = DebuggerGrpcServer(cache_store, condition_mgr=ConditionMgr())

def test_waitcmd_with_pending_status(self):
"""Test wait command interface when status is pending."""
@@ -125,6 +126,7 @@ class TestDebuggerGrpcServer:
assert res.status == EventReply.Status.FAILED

@mock.patch.object(WatchpointHitHandler, 'empty', False)
@mock.patch.object(WatchpointHitHandler, 'put')
@mock.patch.object(DebuggerGrpcServer, '_deal_with_old_command')
def test_waitcmd_with_old_command(self, *args):
"""Test wait command interface with old command."""
@@ -132,8 +134,8 @@ class TestDebuggerGrpcServer:
args[0].return_value = old_command
setattr(self._server, '_status', ServerStatus.WAITING)
setattr(self._server, '_received_view_cmd', {'node_name': 'mock_node_name'})
-        setattr(self._server, '_received_hit', True)
-        res = self._server.WaitCMD(MagicMock(cur_step=1), MagicMock())
+        setattr(self._server, '_received_hit', [MagicMock()])
+        res = self._server.WaitCMD(MagicMock(cur_step=1, cur_node=''), MagicMock())
assert res == old_command

@mock.patch.object(DebuggerGrpcServer, '_deal_with_old_command', return_value=None)
@@ -143,7 +145,7 @@ class TestDebuggerGrpcServer:
old_command = MockDataGenerator.get_run_cmd(steps=1)
args[0].return_value = old_command
setattr(self._server, '_status', ServerStatus.WAITING)
-        res = self._server.WaitCMD(MagicMock(cur_step=1), MagicMock())
+        res = self._server.WaitCMD(MagicMock(cur_step=1, cur_node=''), MagicMock())
assert res == old_command

@mock.patch.object(DebuggerGrpcServer, '_deal_with_old_command', return_value=None)
@@ -152,7 +154,7 @@ class TestDebuggerGrpcServer:
"""Test wait command interface with next command is None."""
args[0].return_value = None
setattr(self._server, '_status', ServerStatus.RECEIVE_GRAPH)
-        res = self._server.WaitCMD(MagicMock(cur_step=1), MagicMock())
+        res = self._server.WaitCMD(MagicMock(cur_step=1, cur_node=''), MagicMock())
assert res == get_ack_reply(1)

@mock.patch.object(DebuggerCache, 'get_command', return_value=(0, None))
@@ -228,6 +230,7 @@ class TestDebuggerGrpcServer:
assert res == get_ack_reply()

@mock.patch.object(WatchpointHandler, 'get_watchpoint_by_id')
@mock.patch.object(GraphHandler, 'get_graph_id_by_full_name', return_value='mock_graph_name')
@mock.patch.object(GraphHandler, 'get_node_name_by_full_name')
def test_send_watchpoint_hit(self, *args):
"""Test SendWatchpointHits interface."""


+17 -12  tests/ut/debugger/test_debugger_server.py

@@ -28,6 +28,7 @@ import pytest

from mindinsight.debugger.common.exceptions.exceptions import DebuggerParamValueError, \
DebuggerCompareTensorError, DebuggerCreateWatchPointError, DebuggerDeleteWatchPointError
from mindinsight.debugger.common.utils import Streams
from mindinsight.debugger.debugger_cache import DebuggerCache
from mindinsight.debugger.debugger_server import DebuggerServer
from mindinsight.debugger.debugger_server import grpc_server_base
@@ -81,7 +82,7 @@ class TestDebuggerServer:
"""Test search node."""
mock_graph = {'nodes': ['mock_nodes']}
args[0].return_value = mock_graph
-        res = self._server.search('mock_name')
+        res = self._server.search({'name': 'mock_name'})
assert res == mock_graph

def test_tensor_comparision_with_wrong_status(self):
@@ -93,6 +94,7 @@ class TestDebuggerServer:

@mock.patch.object(MetadataHandler, 'state', 'waiting')
@mock.patch.object(GraphHandler, 'get_node_type')
@mock.patch.object(GraphHandler, 'get_graph_id_by_name')
@mock.patch.object(GraphHandler, 'get_full_name', return_value='mock_node_name')
def test_tensor_comparision_with_wrong_type(self, *args):
"""Test tensor comparison with wrong type."""
@@ -101,6 +103,7 @@ class TestDebuggerServer:
self._server.tensor_comparisons(name='mock_node_name:0', shape='[:, :]')

@mock.patch.object(MetadataHandler, 'state', 'waiting')
@mock.patch.object(GraphHandler, 'get_graph_id_by_name')
@mock.patch.object(GraphHandler, 'get_node_type', return_value='Parameter')
@mock.patch.object(GraphHandler, 'get_full_name', return_value='mock_node_name')
@mock.patch.object(TensorHandler, 'get_tensors_diff')
@@ -156,7 +159,7 @@ class TestDebuggerServer:
"""Test validate leaf name."""
args[0].return_value = 'name_scope'
with pytest.raises(DebuggerParamValueError, match='Invalid leaf node name.'):
-            self._server._validate_leaf_name(node_name='mock_node_name')
+            self._server._validate_leaf_name(node_name='mock_node_name', graph_name='mock_graph_name')

@mock.patch.object(TensorHandler, 'get')
@mock.patch.object(DebuggerServer, '_get_tensor_name_and_type_by_ui_name')
@@ -199,40 +202,42 @@ class TestDebuggerServer:
self._server.create_watchpoint(watch_condition={'condition': 'INF'})

@mock.patch.object(MetadataHandler, 'state', 'waiting')
     @mock.patch.object(GraphHandler, 'get_full_name', return_value='mock_full_name')
-    @mock.patch.object(GraphHandler, 'get_nodes_by_scope', return_value=[MagicMock()])
+    @mock.patch.object(GraphHandler, 'get_node_basic_info', return_value=[MagicMock()])
@mock.patch.object(GraphHandler, 'get_node_type', return_value='aggregation_scope')
@mock.patch.object(WatchpointHandler, 'create_watchpoint')
def test_create_watchpoint(self, *args):
"""Test create watchpoint."""
args[0].return_value = 1
res = self._server.create_watchpoint({'condition': 'INF'}, ['watch_node_name'])
-        assert res == {'id': 1}
+        assert res == {'id': 1, 'metadata': {'enable_recheck': False, 'state': 'waiting'}}

@mock.patch.object(MetadataHandler, 'state', 'waiting')
@mock.patch.object(GraphHandler, 'validate_graph_name', return_value='kernel_graph_0')
@mock.patch.object(GraphHandler, 'get_searched_node_list')
@mock.patch.object(WatchpointHandler, 'validate_watchpoint_id')
@mock.patch.object(WatchpointHandler, 'update_watchpoint')
def test_update_watchpoint(self, *args):
"""Test update watchpoint."""
-        args[2].return_value = [MagicMock(name='seatch_name/op_name')]
+        args[2].return_value = [MagicMock(name='search_name/op_name')]
res = self._server.update_watchpoint(
-            watch_point_id=1, watch_nodes=['search_name'], mode=1, name='search_name')
-        assert res == {}
+            watch_point_id=1, watch_nodes=['search_name'],
+            mode=1, search_pattern={'name': 'search_name'}, graph_name='kernel_graph_0')
+        assert res == {'metadata': {'enable_recheck': False, 'state': 'waiting'}}

def test_delete_watchpoint_with_wrong_state(self):
"""Test delete watchpoint with wrong state."""
with pytest.raises(DebuggerDeleteWatchPointError, match='Failed to delete watchpoint'):
self._server.delete_watchpoint(watch_point_id=1)

@mock.patch.object(MetadataHandler, 'state', 'waiting')
@mock.patch.object(MetadataHandler, 'enable_recheck', True)
@mock.patch.object(WatchpointHandler, 'is_recheckable', return_value=True)
@mock.patch.object(WatchpointHandler, 'delete_watchpoint')
def test_delete_watchpoint(self, *args):
"""Test delete watchpoint with wrong state."""
self._server.cache_store.get_stream_handler(Streams.METADATA).state = 'waiting'
args[0].return_value = None
res = self._server.delete_watchpoint(1)
-        assert res == {}
+        assert res == {'metadata': {'enable_recheck': True, 'state': 'waiting'}}

@pytest.mark.parametrize('mode, cur_state, state', [
('continue', 'waiting', 'running'),
@@ -242,7 +247,7 @@ class TestDebuggerServer:
"""Test control request."""
with mock.patch.object(MetadataHandler, 'state', cur_state):
res = self._server.control({'mode': mode})
-            assert res == {'metadata': {'state': state}}
+            assert res == {'metadata': {'enable_recheck': False, 'state': state}}

def test_construct_run_event(self):
"""Test construct run event."""


+1 -2    tests/utils/tools.py

@@ -34,11 +34,10 @@ def get_url(url, params):

Args:
url (str): A link requested. For example, http://example.com.
-        params (dict): A dict consists of params. For example, {'offset': 1, 'limit':'100}.
+        params (dict): A dict consists of params. For example, {'offset': 1, 'limit': 100}.

Returns:
str, like http://example.com?offset=1&limit=100

"""

return url + '?' + urlencode(params)
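
A quick check of the documented behaviour (runnable as-is):

    from urllib.parse import urlencode

    def get_url(url, params):
        """Concatenate a base url and query params, as documented above."""
        return url + '?' + urlencode(params)

    assert get_url('http://example.com', {'offset': 1, 'limit': 100}) == 'http://example.com?offset=1&limit=100'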

