
!941 Update debugger version from 1.0 to 1.1, delete the old-version conditions and the old condition API, and fix a bug

From: @jiang-shuqiang
Reviewed-by: @wenkai_dist, @ouwenchang
Signed-off-by: @ouwenchang
tags/v1.1.0
Committed by mindspore-ci-bot (Gitee), 5 years ago
Parent commit: c57e63c3c4
21 changed files with 53 additions and 359 deletions
 1. +0  -7    mindinsight/backend/conditionmgr/conditionmgr_api.py
 2. +1  -1    mindinsight/conf/constants.py
 3. +0  -17   mindinsight/debugger/conditionmgr/condition.py
 4. +5  -164  mindinsight/debugger/conditionmgr/condition_list.py
 5. +2  -26   mindinsight/debugger/conditionmgr/conditionmgr.py
 6. +5  -5    mindinsight/debugger/conditionmgr/recommender.py
 7. +2  -9    mindinsight/debugger/debugger_server.py
 8. +0  -10   mindinsight/debugger/proto/debug_grpc.proto
 9. +0  -11   mindinsight/debugger/stream_cache/watchpoint.py
10. +1  -5    mindinsight/debugger/stream_operator/watchpoint_operator.py
11. +1  -1    tests/st/func/debugger/expect_results/restful_results/create_and_delete_watchpoint.json
12. +1  -1    tests/st/func/debugger/expect_results/restful_results/get_conditions_for_ascend.json
13. +1  -1    tests/st/func/debugger/expect_results/restful_results/get_conditions_for_gpu.json
14. +1  -1    tests/st/func/debugger/expect_results/restful_results/retrieve_tensor_graph-0.json
15. +1  -47   tests/st/func/debugger/expect_results/restful_results/retrieve_watchpoint_hit.json
16. +17 -20   tests/st/func/debugger/test_restful_api.py
17. +1  -1    tests/st/func/debugger/utils.py
18. +3  -23   tests/ut/debugger/expected_results/watchpoint/watchpoint_handler_get_0.json
19. +1  -1    tests/ut/debugger/expected_results/watchpoint/watchpoint_handler_get_1.json
20. +7  -6    tests/ut/debugger/stream_handler/test_watchpoint_handler.py
21. +3  -2    tests/ut/debugger/test_debugger_server.py

+0 -7   mindinsight/backend/conditionmgr/conditionmgr_api.py

@@ -26,13 +26,6 @@ BLUEPRINT = Blueprint("conditionmgr", __name__,
                       url_prefix=settings.URL_PATH_PREFIX + settings.API_PREFIX)
 
 
-@BLUEPRINT.route("/conditionmgr/train-jobs/<train_id>/conditions", methods=["GET"])
-def get_conditions(train_id):
-    """get conditions"""
-    reply = _wrap_reply(BACKEND_SERVER.get_conditions, train_id)
-    return reply
-
-
 @BLUEPRINT.route("/conditionmgr/train-jobs/<train_id>/condition-collections", methods=["GET"])
 def get_condition_collections(train_id):
     """get condition collections"""

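For reference, the surviving route can be exercised as in this minimal sketch; the host, port, and train_id are illustrative, not part of the change:

    import requests

    # Illustrative values only: host/port and train_id depend on the deployment.
    BASE_URL = "http://localhost:8080/v1/mindinsight"
    TRAIN_ID = "train-id"

    # The /conditions route above is gone; /condition-collections remains.
    url = f"{BASE_URL}/conditionmgr/train-jobs/{TRAIN_ID}/condition-collections"
    response = requests.get(url)
    response.raise_for_status()
    print(response.json())
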
+1 -1   mindinsight/conf/constants.py

@@ -64,4 +64,4 @@ MAX_HISTOGRAM_STEP_SIZE_PER_TAG = 50
 MAX_TENSOR_STEP_SIZE_PER_TAG = 20
 MAX_TENSOR_RESPONSE_DATA_SIZE = 100000
 
-ENABLE_RECOMMENDED_WATCHPOINTS = False
+ENABLE_RECOMMENDED_WATCHPOINTS = True

+0 -17  mindinsight/debugger/conditionmgr/condition.py

@@ -18,8 +18,6 @@ Management of all conditions.
 This module is used to register all conditions, as well as their parameters.
 This module also provide the available conditions to condition_collections api.
 """
-import math
-
 from enum import Enum
 from mindinsight.debugger.conditionmgr.log import logger
 
@@ -35,17 +33,6 @@ class ConditionIdEnum(Enum):
     GRADIENT_EXPLODING = "gradient_exploding"
     TENSOR_OVERFLOW = "tensor_overflow"
     OPERATOR_OVERFLOW = "operator_overflow"
-    NAN = "nan"
-    OVERFLOW_ASCEND_CHIP = "overflow"
-    INF = "inf"
-    MAX_GT = "max_gt"
-    MAX_LT = "max_lt"
-    MIN_GT = "min_gt"
-    MIN_LT = "min_lt"
-    MAX_MIN_GT = "max_min_gt"
-    MAX_MIN_LT = "max_min_lt"
-    MEAN_GT = "mean_gt"
-    MEAN_LT = "mean_lt"
     TENSOR_INITIALIZATION = "tensor_initialization"
     TENSOR_TOO_LARGE = "tensor_too_large"
     TENSOR_TOO_SMALL = "tensor_too_small"
@@ -287,7 +274,3 @@ check_abs_param_range(value):
     if 0 <= value < float("inf"):
         return True
     return False
-
-
-def check_not_nan(value):
-    return not math.isnan(value)
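
Why check_not_nan can go without a replacement: NaN fails every ordered comparison, so a range validator like the retained check_abs_param_range already rejects it. A quick self-contained illustration:

    def check_abs_param_range(value):
        """Mirrors the retained validator: finite values in [0, inf)."""
        if 0 <= value < float("inf"):
            return True
        return False

    assert check_abs_param_range(1.0)
    assert not check_abs_param_range(float("nan"))  # 0 <= nan is already False
    assert not check_abs_param_range(float("inf"))  # inf < inf is False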

+5 -164 mindinsight/debugger/conditionmgr/condition_list.py

@@ -29,7 +29,6 @@ from mindinsight.debugger.conditionmgr.condition import check_initialization_ava
 from mindinsight.debugger.conditionmgr.condition import check_normal_param_range
 from mindinsight.debugger.conditionmgr.condition import check_percentage_param_range
 from mindinsight.debugger.conditionmgr.condition import check_abs_param_range
-from mindinsight.debugger.conditionmgr.condition import check_not_nan
 
 
 CONDITION_LIST = [
@@ -67,7 +66,7 @@ CONDITION_LIST = [
         # Send this condition to MindSpore will use WatchCondition.Condition.tensor_general_overflow
         optimize_phase=OptimizePhaseEnum.TENSOR_CHECK,
         parameters=[],
-        supported_target_type=TargetTypeEnum.TENSOR,
+        supported_target_type=TargetTypeEnum.WEIGHT,
         supported_platforms=(PlatformEnum.ASCEND, PlatformEnum.GPU),
         minimum_debugger_capability=(1, 1)
     ),
@@ -225,164 +224,6 @@ CONDITION_LIST = [
         supported_platforms=(PlatformEnum.ASCEND,),
         minimum_debugger_capability=(1, 1)
     ),
-    Condition(
-        condition_id=ConditionIdEnum.NAN,
-        abbr="NAN",
-        # Send this condition to MindSpore will use WatchCondition.Condition.nan
-        optimize_phase=OptimizePhaseEnum.TENSOR_CHECK,
-        parameters=[],
-        supported_target_type=TargetTypeEnum.TENSOR,
-        supported_platforms=(PlatformEnum.GPU,),
-        minimum_debugger_capability=(1, 0)
-    ),
-    Condition(
-        condition_id=ConditionIdEnum.OVERFLOW_ASCEND_CHIP,
-        abbr="OVERFLOW",
-        # Send this condition to MindSpore will use WatchCondition.Condition.overflow
-        optimize_phase=OptimizePhaseEnum.TENSOR_CHECK,
-        parameters=[],
-        supported_target_type=TargetTypeEnum.TENSOR,
-        supported_platforms=(PlatformEnum.ASCEND,),
-        minimum_debugger_capability=(1, 0)
-    ),
-    Condition(
-        condition_id=ConditionIdEnum.INF,
-        abbr="INF",
-        # Send this condition to MindSpore will use WatchCondition.Condition.inf
-        optimize_phase=OptimizePhaseEnum.TENSOR_CHECK,
-        parameters=[],
-        supported_target_type=TargetTypeEnum.TENSOR,
-        supported_platforms=(PlatformEnum.ASCEND, PlatformEnum.GPU),
-        minimum_debugger_capability=(1, 0)
-    ),
-    Condition(
-        condition_id=ConditionIdEnum.MAX_GT,
-        abbr="MAX>",
-        # Send this condition to MindSpore will use WatchCondition.Condition.max_gt
-        optimize_phase=OptimizePhaseEnum.TENSOR_CHECK,
-        parameters=[
-            ConditionParameter(
-                name="param",
-                value_type=ValueTypeEnum.FLOAT64,
-                valid_test_func=check_normal_param_range
-            )
-        ],
-        supported_target_type=TargetTypeEnum.TENSOR,
-        supported_platforms=(PlatformEnum.ASCEND, PlatformEnum.GPU),
-        minimum_debugger_capability=(1, 0)
-    ),
-    Condition(
-        condition_id=ConditionIdEnum.MAX_LT,
-        abbr="MAX<",
-        # Send this condition to MindSpore will use WatchCondition.Condition.max_lt
-        optimize_phase=OptimizePhaseEnum.TENSOR_CHECK,
-        parameters=[
-            ConditionParameter(
-                name="param",
-                value_type=ValueTypeEnum.FLOAT64,
-                valid_test_func=check_normal_param_range
-            )
-        ],
-        supported_target_type=TargetTypeEnum.TENSOR,
-        supported_platforms=(PlatformEnum.ASCEND, PlatformEnum.GPU),
-        minimum_debugger_capability=(1, 0)
-    ),
-    Condition(
-        condition_id=ConditionIdEnum.MIN_GT,
-        abbr="MIN>",
-        # Send this condition to MindSpore will use WatchCondition.Condition.min_gt
-        optimize_phase=OptimizePhaseEnum.TENSOR_CHECK,
-        parameters=[
-            ConditionParameter(
-                name="param",
-                value_type=ValueTypeEnum.FLOAT64,
-                valid_test_func=check_normal_param_range
-            )
-        ],
-        supported_target_type=TargetTypeEnum.TENSOR,
-        supported_platforms=(PlatformEnum.ASCEND, PlatformEnum.GPU),
-        minimum_debugger_capability=(1, 0)
-    ),
-    Condition(
-        condition_id=ConditionIdEnum.MIN_LT,
-        abbr="MIN<",
-        # Send this condition to MindSpore will use WatchCondition.Condition.min_lt
-        optimize_phase=OptimizePhaseEnum.TENSOR_CHECK,
-        parameters=[
-            ConditionParameter(
-                name="param",
-                value_type=ValueTypeEnum.FLOAT64,
-                valid_test_func=check_normal_param_range
-            )
-        ],
-        supported_target_type=TargetTypeEnum.TENSOR,
-        supported_platforms=(PlatformEnum.ASCEND, PlatformEnum.GPU),
-        minimum_debugger_capability=(1, 0)
-    ),
-    Condition(
-        condition_id=ConditionIdEnum.MAX_MIN_GT,
-        abbr="MAX-MIN>",
-        # Send this condition to MindSpore will use WatchCondition.Condition.max_min_gt
-        optimize_phase=OptimizePhaseEnum.TENSOR_CHECK,
-        parameters=[
-            ConditionParameter(
-                name="param",
-                value_type=ValueTypeEnum.FLOAT64,
-                valid_test_func=check_normal_param_range
-            )
-        ],
-        supported_target_type=TargetTypeEnum.TENSOR,
-        supported_platforms=(PlatformEnum.ASCEND, PlatformEnum.GPU),
-        minimum_debugger_capability=(1, 0)
-    ),
-    Condition(
-        condition_id=ConditionIdEnum.MAX_MIN_LT,
-        abbr="MAX-Min<",
-        # Send this condition to MindSpore will use WatchCondition.Condition.max_min_lt
-        optimize_phase=OptimizePhaseEnum.TENSOR_CHECK,
-        parameters=[
-            ConditionParameter(
-                name="param",
-                value_type=ValueTypeEnum.FLOAT64,
-                valid_test_func=check_normal_param_range
-            )
-        ],
-        supported_target_type=TargetTypeEnum.TENSOR,
-        supported_platforms=(PlatformEnum.ASCEND, PlatformEnum.GPU),
-        minimum_debugger_capability=(1, 0)
-    ),
-    Condition(
-        condition_id=ConditionIdEnum.MEAN_GT,
-        abbr="MEAN>",
-        # Send this condition to MindSpore will use WatchCondition.Condition.mean_gt
-        optimize_phase=OptimizePhaseEnum.TENSOR_CHECK,
-        parameters=[
-            ConditionParameter(
-                name="param",
-                value_type=ValueTypeEnum.FLOAT64,
-                valid_test_func=check_normal_param_range
-            )
-        ],
-        supported_target_type=TargetTypeEnum.TENSOR,
-        supported_platforms=(PlatformEnum.ASCEND, PlatformEnum.GPU),
-        minimum_debugger_capability=(1, 0)
-    ),
-    Condition(
-        condition_id=ConditionIdEnum.MEAN_LT,
-        abbr="MEAN<",
-        # Send this condition to MindSpore will use WatchCondition.Condition.mean_lt
-        optimize_phase=OptimizePhaseEnum.TENSOR_CHECK,
-        parameters=[
-            ConditionParameter(
-                name="param",
-                value_type=ValueTypeEnum.FLOAT64,
-                valid_test_func=check_normal_param_range
-            )
-        ],
-        supported_target_type=TargetTypeEnum.TENSOR,
-        supported_platforms=(PlatformEnum.ASCEND, PlatformEnum.GPU),
-        minimum_debugger_capability=(1, 0)
-    ),
     Condition(
         condition_id=ConditionIdEnum.TENSOR_INITIALIZATION,
         abbr="TI",
@@ -578,13 +419,13 @@ CONDITION_LIST = [
             ConditionParameter(
                 name="range_start_inclusive",
                 value_type=ValueTypeEnum.FLOAT64,
-                valid_test_func=check_not_nan,
+                valid_test_func=check_normal_param_range,
                 param_type=ParamTypeEnum.SUPPORT_PARAM
             ),
             ConditionParameter(
                 name="range_end_inclusive",
                 value_type=ValueTypeEnum.FLOAT64,
-                valid_test_func=check_not_nan,
+                valid_test_func=check_normal_param_range,
                 param_type=ParamTypeEnum.SUPPORT_PARAM
             ),
             ConditionParameter(
@@ -623,13 +464,13 @@ CONDITION_LIST = [
             ConditionParameter(
                 name="range_start_inclusive",
                 value_type=ValueTypeEnum.FLOAT64,
-                valid_test_func=check_not_nan,
+                valid_test_func=check_normal_param_range,
                 param_type=ParamTypeEnum.SUPPORT_PARAM
             ),
             ConditionParameter(
                 name="range_end_inclusive",
                 value_type=ValueTypeEnum.FLOAT64,
-                valid_test_func=check_not_nan,
+                valid_test_func=check_normal_param_range,
                 param_type=ParamTypeEnum.SUPPORT_PARAM
             ),
             ConditionParameter(
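
The deleted single-threshold conditions fold into the parameterized tensor checks. The correspondence below is inferred from the updated test fixtures in this commit; it is an illustrative migration sketch, not an API shipped by this change:

    # Old condition id -> (new condition id, parameter that carries the threshold).
    OLD_TO_NEW = {
        "max_gt": ("tensor_too_large", "max_gt"),
        "min_gt": ("tensor_too_large", "min_gt"),
        "mean_gt": ("tensor_too_large", "mean_gt"),
        "max_lt": ("tensor_too_small", "max_lt"),
        "min_lt": ("tensor_too_small", "min_lt"),
        "mean_lt": ("tensor_too_small", "mean_lt"),
    }

    def migrate(old_id, value):
        """Rewrite a pre-1.1 watch_condition payload into the new shape."""
        new_id, param = OLD_TO_NEW[old_id]
        return {"id": new_id, "params": [{"name": param, "value": value}]}

    assert migrate("max_gt", 1.0) == {
        "id": "tensor_too_large",
        "params": [{"name": "max_gt", "value": 1.0}],
    }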


+2 -26  mindinsight/debugger/conditionmgr/conditionmgr.py

@@ -46,30 +46,6 @@ class ConditionMgr:
         for condition in conditions:
             self.register_condition(condition)
 
-    def get_all(self, condition_context):
-        """Get all register conditions."""
-        conditions = []
-        for condition in self.conditions.values():
-            parameters = []
-            if not condition.is_available(condition_context):
-                continue
-            for param in condition.parameters:
-                if not param.visible_on_ui:
-                    continue
-                parameters.append({
-                    "name": param.name,
-                    "type": param.type.name,
-                    "support_disable": param.support_disable,
-                    "default_value": param.default_value
-                })
-            conditions.append({
-                "id": condition.id,
-                "parameters": parameters,
-                "supported_target_type": condition.supported_target_type.name
-            })
-        conditions = sorted(conditions, key=lambda x: x.get('id'))
-        return {"conditions": conditions}
-
     def get_condition(self, condition_id) -> Condition:
         """Get condition by condition id"""
         return self.conditions[condition_id]
@@ -126,9 +102,9 @@ class ConditionMgr:
             })
 
         reply = []
-        self.check_and_sort(collections, TargetTypeEnum.ACTIVATION.value, reply)
-        self.check_and_sort(collections, TargetTypeEnum.GRADIENT.value, reply)
         self.check_and_sort(collections, TargetTypeEnum.TENSOR.value, reply)
         self.check_and_sort(collections, TargetTypeEnum.WEIGHT.value, reply)
+        self.check_and_sort(collections, TargetTypeEnum.ACTIVATION.value, reply)
+        self.check_and_sort(collections, TargetTypeEnum.GRADIENT.value, reply)
 
         return reply

+5 -5   mindinsight/debugger/conditionmgr/recommender.py

@@ -106,7 +106,7 @@ def recommend_watchpoints(condition_mgr: ConditionMgr, graph_stream, condition_c
 
     # add tensor watch points
     merged_info = get_basic_node_info(TargetTypeEnum.TENSOR.value, graph_stream)
-    _recommend_overflow_ascend_chip(merged_info, condition_mgr, watch_points, condition_context)
+    _recommend_operator_overflow(merged_info, condition_mgr, watch_points, condition_context)
     _recommend_tensor_overflow(merged_info, condition_mgr, watch_points, condition_context)
    _recommend_tensor_all_zero(merged_info, condition_mgr, watch_points, condition_context)
 
@@ -165,21 +165,21 @@ def _recommend_tensor_overflow(basic_info_nodes, condition_mgr, watch_points, co
     watch_points.append(overflow_watchpoint)
 
 
-def _recommend_overflow_ascend_chip(basic_info_nodes, condition_mgr, watch_points, condition_context):
+def _recommend_operator_overflow(basic_info_nodes, condition_mgr, watch_points, condition_context):
     """Recommend tensor overflow watchpoint."""
     if not basic_info_nodes:
         return
-    if not condition_mgr.has_condition(ConditionIdEnum.OVERFLOW_ASCEND_CHIP.value, condition_context):
+    if not condition_mgr.has_condition(ConditionIdEnum.OPERATOR_OVERFLOW.value, condition_context):
         return
 
-    condition = condition_mgr.get_condition(condition_id=ConditionIdEnum.OVERFLOW_ASCEND_CHIP.value)
+    condition = condition_mgr.get_condition(condition_id=ConditionIdEnum.OPERATOR_OVERFLOW.value)
     overflow_d_watchpoint = _WatchPointData(
         watch_condition={
             "condition": condition.id,
             "params": []
         },
         watch_nodes=basic_info_nodes.copy(),
-        name='recommend_overflow_ascend_chip_watchpoint'
+        name='recommend_operator_overflow_watchpoint'
     )
     watch_points.append(overflow_d_watchpoint)



+2 -9   mindinsight/debugger/debugger_server.py

@@ -68,17 +68,10 @@ class DebuggerServer:
         self.grpc_server_manager = None
         self.back_server = None
 
-    def get_conditions(self, train_id):
-        """Get all default conditions"""
-        metadata_stream = self.cache_store.get_stream_handler(Streams.METADATA)
-        condition_context = ConditionContext(metadata_stream.backend, metadata_stream.step, (1, 0))
-        log.debug("Train_id: %s, backend: %s", train_id, condition_context.backend)
-        return self.condition_mgr.get_all(condition_context)
-
     def get_condition_collections(self, train_id):
         """Get default condition_collections"""
         metadata_stream = self.cache_store.get_stream_handler(Streams.METADATA)
-        condition_context = ConditionContext(metadata_stream.backend, metadata_stream.step, (1, 0))
+        condition_context = ConditionContext(metadata_stream.backend, metadata_stream.step, (1, 1))
         log.debug("Train_id: %s, backend: %s", train_id, condition_context.backend)
         return self.condition_mgr.get_all_collections(condition_context)
 
@@ -88,7 +81,7 @@ class DebuggerServer:
             log.error("Bool param should be given for set_recommended")
             raise DebuggerParamValueError("Bool param should be given.")
         metadata_stream = self.cache_store.get_stream_handler(Streams.METADATA)
-        condition_context = ConditionContext(metadata_stream.backend, metadata_stream.step, (1, 0))
+        condition_context = ConditionContext(metadata_stream.backend, metadata_stream.step, (1, 1))
         log.debug("Train_id: %s, backend: %s", train_id, condition_context.backend)
         res = metadata_stream.get(['state', 'enable_recheck'])
         if set_recommended and not metadata_stream.recommendation_confirmed:
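
The (1, 0) to (1, 1) bump is what unlocks the new conditions: each condition declares a minimum_debugger_capability, and availability naturally reduces to a lexicographic tuple comparison. A sketch under that assumption (the helper name is hypothetical):

    def capability_satisfied(context_capability, minimum_required):
        """Python compares tuples lexicographically, so (1, 1) >= (1, 0)."""
        return context_capability >= minimum_required

    # While the server still advertised (1, 0), conditions declared with
    # minimum_debugger_capability=(1, 1) were filtered out of the collections.
    assert capability_satisfied((1, 1), (1, 1))
    assert not capability_satisfied((1, 0), (1, 1))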


+0 -10  mindinsight/debugger/proto/debug_grpc.proto

@@ -91,17 +91,7 @@ message ViewCMD {
 
 message WatchCondition {
   enum Condition {
-    nan = 0;
-    inf = 1;
     overflow = 2;
-    max_gt = 3;
-    max_lt = 4;
-    min_gt = 5;
-    min_lt = 6;
-    max_min_gt = 7;
-    max_min_lt = 8;
-    mean_gt = 9;
-    mean_lt = 10;
     sd_gt = 11;
     sd_lt = 12;
     tensor_general_overflow = 13;


+0 -11  mindinsight/debugger/stream_cache/watchpoint.py

@@ -29,18 +29,7 @@ WATCHPOINT_CONDITION_MAPPING = {
     ConditionIdEnum.GRADIENT_EXPLODING.value: WatchCondition.Condition.tensor_general_overflow,
     ConditionIdEnum.GRADIENT_TOO_LARGE.value: WatchCondition.Condition.tensor_too_large,
     ConditionIdEnum.GRADIENT_VANISHING.value: WatchCondition.Condition.tensor_too_small,
-    ConditionIdEnum.INF.value: WatchCondition.Condition.inf,
-    ConditionIdEnum.MAX_GT.value: WatchCondition.Condition.max_gt,
-    ConditionIdEnum.MAX_LT.value: WatchCondition.Condition.max_lt,
-    ConditionIdEnum.MAX_MIN_GT.value: WatchCondition.Condition.max_min_gt,
-    ConditionIdEnum.MAX_MIN_LT.value: WatchCondition.Condition.max_min_lt,
-    ConditionIdEnum.MEAN_GT.value: WatchCondition.Condition.mean_gt,
-    ConditionIdEnum.MEAN_LT.value: WatchCondition.Condition.mean_lt,
-    ConditionIdEnum.MIN_GT.value: WatchCondition.Condition.min_gt,
-    ConditionIdEnum.MIN_LT.value: WatchCondition.Condition.min_lt,
-    ConditionIdEnum.NAN.value: WatchCondition.Condition.nan,
     ConditionIdEnum.OPERATOR_OVERFLOW.value: WatchCondition.Condition.overflow,
-    ConditionIdEnum.OVERFLOW_ASCEND_CHIP.value: WatchCondition.Condition.overflow,
     ConditionIdEnum.TENSOR_ALL_ZERO.value: WatchCondition.Condition.tensor_all_zero,
     ConditionIdEnum.TENSOR_INITIALIZATION.value: WatchCondition.Condition.tensor_initialization,
     ConditionIdEnum.TENSOR_OVERFLOW.value: WatchCondition.Condition.tensor_general_overflow,
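
The trimmed mapping translates a UI condition id into the gRPC enum when a watchpoint is sent to MindSpore. A minimal sketch of that lookup (the helper is hypothetical; the dict is the real WATCHPOINT_CONDITION_MAPPING):

    def to_proto_condition(condition_id, mapping):
        """Translate a UI condition id into a WatchCondition.Condition value."""
        if condition_id not in mapping:
            # Retired ids such as 'max_gt' or 'nan' now fail fast here.
            raise ValueError(f"unsupported watch condition: {condition_id}")
        return mapping[condition_id]

    # e.g. to_proto_condition('operator_overflow', WATCHPOINT_CONDITION_MAPPING)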


+1 -5   mindinsight/debugger/stream_operator/watchpoint_operator.py

@@ -95,13 +95,9 @@ class WatchpointOperator:
     def _validate_watch_condition(self, watch_condition):
         """Validate watch condition."""
         metadata_stream = self._metadata_stream
-        if metadata_stream.backend == 'GPU' and watch_condition.get('id') in (
-                ConditionIdEnum.OVERFLOW_ASCEND_CHIP.value, ConditionIdEnum.OPERATOR_OVERFLOW.value):
+        if metadata_stream.backend == 'GPU' and watch_condition.get('id') == ConditionIdEnum.OPERATOR_OVERFLOW.value:
             log.error("GPU doesn't support overflow watch condition.")
             raise DebuggerParamValueError("GPU doesn't support overflow watch condition.")
-        if metadata_stream.backend == 'Ascend' and watch_condition.get('id') == ConditionIdEnum.NAN.value:
-            log.error("Ascend doesn't support nan watch condition.")
-            raise DebuggerParamValueError("Ascend doesn't support nan watch condition.")
 
     def update_watchpoint(self, params):
         """


+1 -1   tests/st/func/debugger/expect_results/restful_results/create_and_delete_watchpoint.json

@@ -1 +1 @@
-{"watch_points": [{"id": 1, "watch_condition": {"id": "max_gt", "params": [{"name": "param", "value": 1.0}], "abbr": "MAX>"}}, {"id": 2, "watch_condition": {"id": "max_lt", "params": [{"name": "param", "value": -1.0}], "abbr": "MAX<"}}, {"id": 3, "watch_condition": {"id": "min_gt", "params": [{"name": "param", "value": 1e+32}], "abbr": "MIN>"}}, {"id": 5, "watch_condition": {"id": "max_min_gt", "params": [{"name": "param", "value": 0}], "abbr": "MAX-MIN>"}}, {"id": 6, "watch_condition": {"id": "max_min_lt", "params": [{"name": "param", "value": 0}], "abbr": "MAX-Min<"}}, {"id": 7, "watch_condition": {"id": "mean_gt", "params": [{"name": "param", "value": 0}], "abbr": "MEAN>"}}, {"id": 8, "watch_condition": {"id": "mean_lt", "params": [{"name": "param", "value": 0}], "abbr": "MEAN<"}}, {"id": 9, "watch_condition": {"id": "inf", "params": [], "abbr": "INF"}}, {"id": 10, "watch_condition": {"id": "overflow", "params": [], "abbr": "OVERFLOW"}}]}
+{"watch_points": [{"id": 1, "watch_condition": {"id": "tensor_too_large", "params": [{"name": "max_gt", "value": 1.0}], "abbr": "TL"}}, {"id": 2, "watch_condition": {"id": "tensor_too_small", "params": [{"name": "max_lt", "value": -1.0}], "abbr": "TS"}}, {"id": 3, "watch_condition": {"id": "tensor_too_large", "params": [{"name": "min_gt", "value": 1e+32}], "abbr": "TL"}}, {"id": 5, "watch_condition": {"id": "tensor_too_large", "params": [{"name": "mean_gt", "value": 0}], "abbr": "TL"}}, {"id": 6, "watch_condition": {"id": "tensor_too_small", "params": [{"name": "mean_lt", "value": 0}], "abbr": "TS"}}]}

+1 -1   tests/st/func/debugger/expect_results/restful_results/get_conditions_for_ascend.json
(File diff suppressed because it is too large.)


+1 -1   tests/st/func/debugger/expect_results/restful_results/get_conditions_for_gpu.json
(File diff suppressed because it is too large.)


+1 -1   tests/st/func/debugger/expect_results/restful_results/retrieve_tensor_graph-0.json

@@ -1 +1 @@
-{"graph": {"nodes": [{"name": "Default/args0", "full_name": "Default/args0", "type": "Parameter", "input": {}, "output": {"Default/TransData-op99": {"shape": [[32, 1, 32, 32]], "edge_type": "data", "independent_layout": false, "data_type": "DT_TENSOR[DT_FLOAT32]", "slot_mapping": [["0", ""]]}}, "slots": [{"slot": "0", "statistics": {"overall_max": 6.0, "overall_min": 1.0, "overall_avg": 3.5, "overall_count": 6, "overall_nan_count": 0, "overall_neg_inf_count": 0, "overall_pos_inf_count": 0, "overall_zero_count": 0.0, "overall_neg_zero_count": 0.0, "overall_pos_zero_count": 6.0}, "shape": [2, 3], "has_prev_step": true}], "graph_name": "graph_0"}, {"name": "Default/network-WithLossCell/_backbone-LeNet5/conv1-Conv2d/Cast-op190", "full_name": "Default/network-WithLossCell/_backbone-LeNet5/conv1-Conv2d/Cast-op190", "type": "Cast", "input": {"Default/TransData-op99": {"shape": [[32, 1, 32, 32]], "edge_type": "data", "independent_layout": false, "data_type": "DT_TENSOR[DT_FLOAT32]", "slot_mapping": [["0", ""]]}}, "output": {}, "slots": [{"slot": "0", "statistics": {"overall_max": 6.0, "overall_min": 1.0, "overall_avg": 3.5, "overall_count": 6, "overall_nan_count": 0, "overall_neg_inf_count": 0, "overall_pos_inf_count": 0, "overall_zero_count": 0.0, "overall_neg_zero_count": 0.0, "overall_pos_zero_count": 6.0}, "shape": [2, 3]}], "graph_name": "graph_0"}, {"name": "Default/TransData-op99", "full_name": "Default/TransData-op99", "type": "TransData", "input": {"Default/args0": {"shape": [[32, 1, 32, 32]], "edge_type": "data", "independent_layout": false, "data_type": "DT_TENSOR[DT_FLOAT32]", "slot_mapping": [["0", ""]]}}, "output": {"Default/network-WithLossCell/_backbone-LeNet5/conv1-Conv2d/Cast-op190": {"shape": [[32, 1, 32, 32]], "edge_type": "data", "independent_layout": false, "data_type": "DT_TENSOR[DT_FLOAT32]", "slot_mapping": [["0", ""]]}}, "slots": [{"slot": "0", "summarized_error_code": 0, "watch_points": [{"id": 1, "watch_condition": {"id": "inf", "params": [], "abbr": "INF"}, "error_code": 0}], "statistics": {"overall_max": 6.0, "overall_min": 1.0, "overall_avg": 3.5, "overall_count": 6, "overall_nan_count": 0, "overall_neg_inf_count": 0, "overall_pos_inf_count": 0, "overall_zero_count": 0.0, "overall_neg_zero_count": 0.0, "overall_pos_zero_count": 6.0}, "shape": [2, 3]}], "graph_name": "graph_0"}]}}
+{"graph": {"nodes": [{"name": "Default/args0", "full_name": "Default/args0", "type": "Parameter", "input": {}, "output": {"Default/TransData-op99": {"shape": [[32, 1, 32, 32]], "edge_type": "data", "independent_layout": false, "data_type": "DT_TENSOR[DT_FLOAT32]", "slot_mapping": [["0", ""]]}}, "slots": [{"slot": "0", "statistics": {"overall_max": 6.0, "overall_min": 1.0, "overall_avg": 3.5, "overall_count": 6, "overall_nan_count": 0, "overall_neg_inf_count": 0, "overall_pos_inf_count": 0, "overall_zero_count": 0.0, "overall_neg_zero_count": 0.0, "overall_pos_zero_count": 6.0}, "shape": [2, 3], "has_prev_step": true}], "graph_name": "graph_0"}, {"name": "Default/network-WithLossCell/_backbone-LeNet5/conv1-Conv2d/Cast-op190", "full_name": "Default/network-WithLossCell/_backbone-LeNet5/conv1-Conv2d/Cast-op190", "type": "Cast", "input": {"Default/TransData-op99": {"shape": [[32, 1, 32, 32]], "edge_type": "data", "independent_layout": false, "data_type": "DT_TENSOR[DT_FLOAT32]", "slot_mapping": [["0", ""]]}}, "output": {}, "slots": [{"slot": "0", "statistics": {"overall_max": 6.0, "overall_min": 1.0, "overall_avg": 3.5, "overall_count": 6, "overall_nan_count": 0, "overall_neg_inf_count": 0, "overall_pos_inf_count": 0, "overall_zero_count": 0.0, "overall_neg_zero_count": 0.0, "overall_pos_zero_count": 6.0}, "shape": [2, 3]}], "graph_name": "graph_0"}, {"name": "Default/TransData-op99", "full_name": "Default/TransData-op99", "type": "TransData", "input": {"Default/args0": {"shape": [[32, 1, 32, 32]], "edge_type": "data", "independent_layout": false, "data_type": "DT_TENSOR[DT_FLOAT32]", "slot_mapping": [["0", ""]]}}, "output": {"Default/network-WithLossCell/_backbone-LeNet5/conv1-Conv2d/Cast-op190": {"shape": [[32, 1, 32, 32]], "edge_type": "data", "independent_layout": false, "data_type": "DT_TENSOR[DT_FLOAT32]", "slot_mapping": [["0", ""]]}}, "slots": [{"slot": "0", "summarized_error_code": 0, "watch_points": [{"id": 1, "watch_condition": {"id": "tensor_too_large", "params": [{"name": "max_gt", "value": 1.0, "actual_value": null}], "abbr": "TL"}, "error_code": 0}], "statistics": {"overall_max": 6.0, "overall_min": 1.0, "overall_avg": 3.5, "overall_count": 6, "overall_nan_count": 0, "overall_neg_inf_count": 0, "overall_pos_inf_count": 0, "overall_zero_count": 0.0, "overall_neg_zero_count": 0.0, "overall_pos_zero_count": 6.0}, "shape": [2, 3]}], "graph_name": "graph_0"}]}}

+1 -47  tests/st/func/debugger/expect_results/restful_results/retrieve_watchpoint_hit.json

@@ -1,47 +1 @@
-{
-    "watch_point_hits": [
-        {
-            "node_name": "Default/TransData-op99",
-            "tensors": [
-                {
-                    "slot": "0",
-                    "summarized_error_code": 0,
-                    "watch_points": [
-                        {
-                            "id": 1,
-                            "watch_condition": {
-                                "id": "inf",
-                                "params": [],
-                                "abbr": "INF"
-                            },
-                            "error_code": 0
-                        }
-                    ]
-                }
-            ],
-            "graph_name": "graph_0"
-        },
-        {
-            "node_name": "Default/optimizer-Momentum/ApplyMomentum[8]_1/ApplyMomentum-op25",
-            "tensors": [
-                {
-                    "slot": "0",
-                    "summarized_error_code": 0,
-                    "watch_points": [
-                        {
-                            "id": 1,
-                            "watch_condition": {
-                                "id": "inf",
-                                "params": [],
-                                "abbr": "INF"
-                            },
-                            "error_code": 0
-                        }
-                    ]
-                }
-            ],
-            "graph_name": "graph_0"
-        }
-    ],
-    "outdated": false
-}
+{"watch_point_hits": [{"node_name": "Default/TransData-op99", "tensors": [{"slot": "0", "summarized_error_code": 0, "watch_points": [{"id": 1, "watch_condition": {"id": "tensor_too_large", "params": [{"name": "max_gt", "value": 1.0, "actual_value": null}], "abbr": "TL"}, "error_code": 0}]}], "graph_name": "graph_0"}, {"node_name": "Default/optimizer-Momentum/ApplyMomentum[8]_1/ApplyMomentum-op25", "tensors": [{"slot": "0", "summarized_error_code": 0, "watch_points": [{"id": 1, "watch_condition": {"id": "tensor_too_large", "params": [{"name": "max_gt", "value": 1.0, "actual_value": null}], "abbr": "TL"}, "error_code": 0}]}], "graph_name": "graph_0"}], "outdated": false}

+17 -20 tests/st/func/debugger/test_restful_api.py

@@ -84,7 +84,7 @@ class TestAscendDebugger:
 
     def test_get_conditions(self, app_client):
         """Test get conditions for ascend."""
-        url = '/v1/mindinsight/conditionmgr/train-jobs/train-id/conditions'
+        url = '/v1/mindinsight/conditionmgr/train-jobs/train-id/condition-collections'
         body_data = {}
         expect_file = 'get_conditions_for_ascend.json'
         with self._debugger_client.get_thread_instance():
@@ -131,16 +131,12 @@ class TestAscendDebugger:
         with self._debugger_client.get_thread_instance():
             check_state(app_client)
             conditions = [
-                {'id': 'max_gt', 'params': [{'name': 'param', 'value': 1.0}]},
-                {'id': 'max_lt', 'params': [{'name': 'param', 'value': -1.0}]},
-                {'id': 'min_gt', 'params': [{'name': 'param', 'value': 1e+32}]},
-                {'id': 'min_lt', 'params': [{'name': 'param', 'value': -1e+32}]},
-                {'id': 'max_min_gt', 'params': [{'name': 'param', 'value': 0}]},
-                {'id': 'max_min_lt', 'params': [{'name': 'param', 'value': 0}]},
-                {'id': 'mean_gt', 'params': [{'name': 'param', 'value': 0}]},
-                {'id': 'mean_lt', 'params': [{'name': 'param', 'value': 0}]},
-                {'id': 'inf', 'params': []},
-                {'id': 'overflow', 'params': []},
+                {'id': 'tensor_too_large', 'params': [{'name': 'max_gt', 'value': 1.0}]},
+                {'id': 'tensor_too_small', 'params': [{'name': 'max_lt', 'value': -1.0}]},
+                {'id': 'tensor_too_large', 'params': [{'name': 'min_gt', 'value': 1e+32}]},
+                {'id': 'tensor_too_small', 'params': [{'name': 'min_lt', 'value': -1e+32}]},
+                {'id': 'tensor_too_large', 'params': [{'name': 'mean_gt', 'value': 0}]},
+                {'id': 'tensor_too_small', 'params': [{'name': 'mean_lt', 'value': 0}]}
             ]
             for idx, condition in enumerate(conditions):
                 create_watchpoint(app_client, condition, idx + 1)
@@ -167,7 +163,7 @@ class TestAscendDebugger:
         leaf_node_name = 'Default/optimizer-Momentum/Parameter[18]_7/moments.fc3.bias'
         with self._debugger_client.get_thread_instance():
             check_state(app_client)
-            condition = {'id': 'inf', 'params': []}
+            condition = {'id': 'tensor_too_large', 'params': [{'name': 'max_gt', 'value': 1.0}]}
             create_watchpoint(app_client, condition, watch_point_id)
             # update watchpoint watchpoint list
             url = 'update_watchpoint'
@@ -327,7 +323,7 @@ class TestAscendDebugger:
     @pytest.mark.platform_x86_ascend_training
     @pytest.mark.parametrize("url, body_data, enable_recheck", [
         ('create_watchpoint',
-         {'condition': {'id': 'inf', 'params': []},
+         {'condition': {'id': 'tensor_too_large', 'params': [{'name': 'max_gt', 'value': 1.0}]},
          'watch_nodes': ['Default']}, True),
         ('update_watchpoint',
         {'watch_point_id': 1, 'watch_nodes': ['Default/optimizer-Momentum/Parameter[18]_7'],
@@ -434,10 +430,10 @@ class TestGPUDebugger:
     @pytest.mark.platform_x86_ascend_training
     @pytest.mark.parametrize("url, body_data, enable_recheck", [
         ('create_watchpoint',
-         {'condition': {'id': 'inf', 'params': []},
+         {'condition': {'id': 'tensor_too_large', 'params': [{'name': 'max_gt', 'value': 1.0}]},
          'watch_nodes': ['Default']}, True),
         ('create_watchpoint',
-         {'condition': {'id': 'inf', 'params': []},
+         {'condition': {'id': 'tensor_too_large', 'params': [{'name': 'max_gt', 'value': 1.0}]},
          'watch_nodes': ['Default/TransData-op99']}, True),
         ('update_watchpoint',
         {'watch_point_id': 1, 'watch_nodes': ['Default/optimizer-Momentum/Parameter[18]_7'],
@@ -472,7 +468,7 @@ class TestGPUDebugger:
 
     def test_get_conditions(self, app_client):
         """Test get conditions for gpu."""
-        url = '/v1/mindinsight/conditionmgr/train-jobs/train-id/conditions'
+        url = '/v1/mindinsight/conditionmgr/train-jobs/train-id/condition-collections'
         body_data = {}
         expect_file = 'get_conditions_for_gpu.json'
         with self._debugger_client.get_thread_instance():
@@ -493,7 +489,7 @@ class TestGPUDebugger:
             # send recheck when disable to do recheck
             get_request_result(app_client, 'recheck', {}, method='post', expect_code=400)
             # send recheck when enable to do recheck
-            create_watchpoint(app_client, {'id': 'inf', 'params': []}, 2)
+            create_watchpoint(app_client, {'id': 'tensor_too_large', 'params': [{'name': 'max_gt', 'value': 1.0}]}, 2)
             res = get_request_result(app_client, 'recheck', {}, method='post')
             assert res['metadata']['enable_recheck'] is False
@@ -579,10 +575,10 @@ class TestMultiGraphDebugger:
     @pytest.mark.platform_x86_gpu_training
     @pytest.mark.platform_x86_ascend_training
     @pytest.mark.parametrize("filter_condition, expect_id", [
-        ({'condition': {'id': 'inf'},
+        ({'condition': {'id': 'tensor_too_large', 'params': [{'name': 'max_gt', 'value': 1.0}]},
          'watch_nodes': ['Default/optimizer-Momentum/Parameter[18]_7'],
          'graph_name': 'graph_0'}, 1),
-        ({'condition': {'id': 'inf'},
+        ({'condition': {'id': 'tensor_too_large', 'params': [{'name': 'max_gt', 'value': 1.0}]},
          'watch_nodes': ['graph_0/Default/optimizer-Momentum/ApplyMomentum[8]_1'],
          'graph_name': None}, 1)
     ])
@@ -665,7 +661,8 @@ def create_watchpoint(app_client, condition, expect_id):
 def create_watchpoint_and_wait(app_client):
     """Preparation for recheck."""
     check_state(app_client)
-    create_watchpoint(app_client, condition={'id': 'inf', 'params': []}, expect_id=1)
+    create_watchpoint(app_client, condition={'id': 'tensor_too_large', 'params': [{'name': 'max_gt', 'value': 1.0}]},
+                      expect_id=1)
     # send run command to get watchpoint hit
     url = 'control'
     body_data = {'mode': 'continue',
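
For reference, the request body shape the updated tests now send when creating a watchpoint. The payload is copied from the tests; the literal route is an assumption, since the tests address it through a helper:

    # Payload copied from the updated tests; only the route below is assumed.
    body_data = {
        'condition': {'id': 'tensor_too_large',
                      'params': [{'name': 'max_gt', 'value': 1.0}]},
        'watch_nodes': ['Default'],
    }
    # e.g. app_client.post('/v1/mindinsight/debugger/create-watchpoint', json=body_data)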


+1 -1   tests/st/func/debugger/utils.py

@@ -74,7 +74,7 @@ def send_and_save_result(app_client, url, body_data, file_path, method='post'):
 
 def delete_random_items(res):
     """delete the random items in metadata."""
-    if res.get('metadata'):
+    if isinstance(res, dict) and res.get('metadata'):
         if res['metadata'].get('ip'):
             res['metadata'].pop('ip')
         if res['metadata'].get('pos'):
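
The isinstance guard looks like the bug fix named in the commit title: some saved replies decode to JSON arrays, and a list has no .get method. A short illustration:

    res = [{'watch_points': []}]     # a reply that decodes to a list, not a dict
    # res.get('metadata')            # would raise AttributeError on a list
    if isinstance(res, dict) and res.get('metadata'):
        res['metadata'].pop('ip', None)   # only reached for dict replies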


+3 -23  tests/ut/debugger/expected_results/watchpoint/watchpoint_handler_get_0.json

@@ -1,25 +1,5 @@
 [
-    {
-        "watchCondition": {
-            "condition": "inf"
-        },
-        "id": 1,
-        "watch_nodes_num": 0
-    },
-    {
-        "watchCondition": {
-            "condition": "inf"
-        },
-        "id": 2,
-        "watch_nodes_num": 172
-    },
-    {
-        "watchCondition": {
-            "condition": "max_gt",
-            "params": [{"name": "param", "value": 1}],
-            "value": 1
-        },
-        "id": 3,
-        "watch_nodes_num": 1
-    }
+    {"watchCondition": {"condition": "tensor_too_small", "value": 1.0, "params": [{"name": "abs_mean_lt", "disabled": true}, {"name": "max_lt", "value": 1.0}, {"name": "min_lt", "disabled": true}, {"name": "mean_lt", "disabled": true}]}, "id": 1, "watch_nodes_num": 0},
+    {"watchCondition": {"condition": "tensor_too_small", "value": 1.0, "params": [{"name": "abs_mean_lt", "disabled": true}, {"name": "max_lt", "disabled": true}, {"name": "min_lt", "value": 1.0}, {"name": "mean_lt", "disabled": true}]}, "id": 2, "watch_nodes_num": 172},
+    {"watchCondition": {"condition": "tensor_too_large", "value": 1.0, "params": [{"name": "abs_mean_gt", "disabled": true}, {"name": "max_gt", "value": 1.0}, {"name": "min_gt", "disabled": true}, {"name": "mean_gt", "disabled": true}]}, "id": 3, "watch_nodes_num": 1}
 ]

+1 -1   tests/ut/debugger/expected_results/watchpoint/watchpoint_handler_get_1.json

@@ -1 +1 @@
-[{"id": 1, "watch_condition": {"id": "inf", "params": [], "abbr": "INF"}}, {"id": 2, "watch_condition": {"id": "inf", "params": [], "abbr": "INF"}}, {"id": 3, "watch_condition": {"id": "max_gt", "params": [{"name": "param", "value": 1}], "abbr": "MAX>"}}]
+[{"id": 1, "watch_condition": {"id": "tensor_too_small", "params": [{"name": "max_lt", "value": 1.0}], "abbr": "TS"}}, {"id": 2, "watch_condition": {"id": "tensor_too_small", "params": [{"name": "min_lt", "value": 1.0}], "abbr": "TS"}}, {"id": 3, "watch_condition": {"id": "tensor_too_large", "params": [{"name": "max_gt", "value": 1.0}], "abbr": "TL"}}]

+7 -6   tests/ut/debugger/stream_handler/test_watchpoint_handler.py

@@ -61,9 +61,9 @@ class TestWatchpointHandler:
     def _create_watchpoint(self):
         """Test create_watchpoint."""
         watchpoints = [
-            ({'id': 'inf', 'params': []}, None, None, 1),
-            ({'id': 'inf', 'params': []}, ["Default"], None, 2),
-            ({'id': 'max_gt', 'params': [{'name': 'param', 'value': 1}]},
+            ({'id': 'tensor_too_small', 'params': [{'name': 'max_lt', 'value': 1.0}]}, None, None, 1),
+            ({'id': 'tensor_too_small', 'params': [{'name': 'min_lt', 'value': 1.0}]}, ["Default"], None, 2),
+            ({'id': 'tensor_too_large', 'params': [{'name': 'max_gt', 'value': 1.0}]},
              ["Gradients/Default/network-WithLossCell/_backbone-LeNet5/relu-ReLU/gradReLU/ReluGradV2-op92"],
             None, 3)
         ]
@@ -160,7 +160,8 @@ class TestWatchpointHandler:
                              expect_deleted_ids):
         """Test delete_watchpoint."""
         for _ in range(watch_point_id):
-            self.handler.create_watchpoint(self.conditionmgr, {'id': 'inf', 'param': []})
+            self.handler.create_watchpoint(self.conditionmgr,
+                                           {'id': 'tensor_too_small', 'params': [{'name': 'max_lt', 'value': 1.0}]})
         with TestCase().assertLogs(logger=log, level='DEBUG') as log_content:
             self.handler.delete_watchpoint(watch_point_id)
         TestCase().assertIn(
@@ -233,13 +234,13 @@ def test_validate_watch_condition_type_error():
 
 def test_validate_watch_condition_params_except():
     """Test validate_watch_condition_params."""
-    watch_condition = {'id': 'inf', 'params': [{'name': 'param', 'value': 0}]}
+    watch_condition = {'id': 'weight_overflow', 'params': [{'name': 'param', 'value': 0}]}
     conditionmgr = ConditionMgr()
     with pytest.raises(DebuggerParamValueError) as err:
         validate_watch_condition_params(conditionmgr, watch_condition)
     assert err.value.error_code == '5054B081'
 
-    watch_condition = {'id': 'max_gt', 'params': [{'name': 'param', 'value': '0'}]}
+    watch_condition = {'id': 'tensor_overflow', 'params': [{'name': 'param', 'value': '0'}]}
     with pytest.raises(DebuggerParamValueError) as err:
         validate_watch_condition_params(conditionmgr, watch_condition)
     assert err.value.error_code == '5054B081'

+3 -2   tests/ut/debugger/test_debugger_server.py

@@ -199,8 +199,9 @@ class TestDebuggerServer:
     def test_create_watchpoint(self, *args):
         """Test create watchpoint."""
         args[0].return_value = 1
-        res = self._server.create_watchpoint({'watch_condition': {'id': 'inf'},
-                                              'watch_nodes': ['watch_node_name']})
+        res = self._server.create_watchpoint(
+            {'watch_condition': {'id': 'tensor_too_large', 'params': [{'name': 'max_gt', 'value': 1.0}]},
+             'watch_nodes': ['watch_node_name']})
         assert res == {'id': 1, 'metadata': {'enable_recheck': False, 'state': 'waiting'}}
 
     @mock.patch.object(MetadataHandler, 'state', 'waiting')

