From: @jiang-shuqiang Reviewed-by: @wenkai_dist,@ouwenchang Signed-off-by: @ouwenchangtags/v1.1.0
| @@ -26,13 +26,6 @@ BLUEPRINT = Blueprint("conditionmgr", __name__, | |||||
| url_prefix=settings.URL_PATH_PREFIX + settings.API_PREFIX) | url_prefix=settings.URL_PATH_PREFIX + settings.API_PREFIX) | ||||
| @BLUEPRINT.route("/conditionmgr/train-jobs/<train_id>/conditions", methods=["GET"]) | |||||
| def get_conditions(train_id): | |||||
| """get conditions""" | |||||
| reply = _wrap_reply(BACKEND_SERVER.get_conditions, train_id) | |||||
| return reply | |||||
| @BLUEPRINT.route("/conditionmgr/train-jobs/<train_id>/condition-collections", methods=["GET"]) | @BLUEPRINT.route("/conditionmgr/train-jobs/<train_id>/condition-collections", methods=["GET"]) | ||||
| def get_condition_collections(train_id): | def get_condition_collections(train_id): | ||||
| """get condition collections""" | """get condition collections""" | ||||
| @@ -64,4 +64,4 @@ MAX_HISTOGRAM_STEP_SIZE_PER_TAG = 50 | |||||
| MAX_TENSOR_STEP_SIZE_PER_TAG = 20 | MAX_TENSOR_STEP_SIZE_PER_TAG = 20 | ||||
| MAX_TENSOR_RESPONSE_DATA_SIZE = 100000 | MAX_TENSOR_RESPONSE_DATA_SIZE = 100000 | ||||
| ENABLE_RECOMMENDED_WATCHPOINTS = False | |||||
| ENABLE_RECOMMENDED_WATCHPOINTS = True | |||||
| @@ -18,8 +18,6 @@ Management of all conditions. | |||||
| This module is used to register all conditions, as well as their parameters. | This module is used to register all conditions, as well as their parameters. | ||||
| This module also provide the available conditions to condition_collections api. | This module also provide the available conditions to condition_collections api. | ||||
| """ | """ | ||||
| import math | |||||
| from enum import Enum | from enum import Enum | ||||
| from mindinsight.debugger.conditionmgr.log import logger | from mindinsight.debugger.conditionmgr.log import logger | ||||
| @@ -35,17 +33,6 @@ class ConditionIdEnum(Enum): | |||||
| GRADIENT_EXPLODING = "gradient_exploding" | GRADIENT_EXPLODING = "gradient_exploding" | ||||
| TENSOR_OVERFLOW = "tensor_overflow" | TENSOR_OVERFLOW = "tensor_overflow" | ||||
| OPERATOR_OVERFLOW = "operator_overflow" | OPERATOR_OVERFLOW = "operator_overflow" | ||||
| NAN = "nan" | |||||
| OVERFLOW_ASCEND_CHIP = "overflow" | |||||
| INF = "inf" | |||||
| MAX_GT = "max_gt" | |||||
| MAX_LT = "max_lt" | |||||
| MIN_GT = "min_gt" | |||||
| MIN_LT = "min_lt" | |||||
| MAX_MIN_GT = "max_min_gt" | |||||
| MAX_MIN_LT = "max_min_lt" | |||||
| MEAN_GT = "mean_gt" | |||||
| MEAN_LT = "mean_lt" | |||||
| TENSOR_INITIALIZATION = "tensor_initialization" | TENSOR_INITIALIZATION = "tensor_initialization" | ||||
| TENSOR_TOO_LARGE = "tensor_too_large" | TENSOR_TOO_LARGE = "tensor_too_large" | ||||
| TENSOR_TOO_SMALL = "tensor_too_small" | TENSOR_TOO_SMALL = "tensor_too_small" | ||||
| @@ -287,7 +274,3 @@ def check_abs_param_range(value): | |||||
| if 0 <= value < float("inf"): | if 0 <= value < float("inf"): | ||||
| return True | return True | ||||
| return False | return False | ||||
| def check_not_nan(value): | |||||
| return not math.isnan(value) | |||||
| @@ -29,7 +29,6 @@ from mindinsight.debugger.conditionmgr.condition import check_initialization_ava | |||||
| from mindinsight.debugger.conditionmgr.condition import check_normal_param_range | from mindinsight.debugger.conditionmgr.condition import check_normal_param_range | ||||
| from mindinsight.debugger.conditionmgr.condition import check_percentage_param_range | from mindinsight.debugger.conditionmgr.condition import check_percentage_param_range | ||||
| from mindinsight.debugger.conditionmgr.condition import check_abs_param_range | from mindinsight.debugger.conditionmgr.condition import check_abs_param_range | ||||
| from mindinsight.debugger.conditionmgr.condition import check_not_nan | |||||
| CONDITION_LIST = [ | CONDITION_LIST = [ | ||||
| @@ -67,7 +66,7 @@ CONDITION_LIST = [ | |||||
| # Send this condition to MindSpore will use WatchCondition.Condition.tensor_general_overflow | # Send this condition to MindSpore will use WatchCondition.Condition.tensor_general_overflow | ||||
| optimize_phase=OptimizePhaseEnum.TENSOR_CHECK, | optimize_phase=OptimizePhaseEnum.TENSOR_CHECK, | ||||
| parameters=[], | parameters=[], | ||||
| supported_target_type=TargetTypeEnum.TENSOR, | |||||
| supported_target_type=TargetTypeEnum.WEIGHT, | |||||
| supported_platforms=(PlatformEnum.ASCEND, PlatformEnum.GPU), | supported_platforms=(PlatformEnum.ASCEND, PlatformEnum.GPU), | ||||
| minimum_debugger_capability=(1, 1) | minimum_debugger_capability=(1, 1) | ||||
| ), | ), | ||||
| @@ -225,164 +224,6 @@ CONDITION_LIST = [ | |||||
| supported_platforms=(PlatformEnum.ASCEND,), | supported_platforms=(PlatformEnum.ASCEND,), | ||||
| minimum_debugger_capability=(1, 1) | minimum_debugger_capability=(1, 1) | ||||
| ), | ), | ||||
| Condition( | |||||
| condition_id=ConditionIdEnum.NAN, | |||||
| abbr="NAN", | |||||
| # Send this condition to MindSpore will use WatchCondition.Condition.nan | |||||
| optimize_phase=OptimizePhaseEnum.TENSOR_CHECK, | |||||
| parameters=[], | |||||
| supported_target_type=TargetTypeEnum.TENSOR, | |||||
| supported_platforms=(PlatformEnum.GPU,), | |||||
| minimum_debugger_capability=(1, 0) | |||||
| ), | |||||
| Condition( | |||||
| condition_id=ConditionIdEnum.OVERFLOW_ASCEND_CHIP, | |||||
| abbr="OVERFLOW", | |||||
| # Send this condition to MindSpore will use WatchCondition.Condition.overflow | |||||
| optimize_phase=OptimizePhaseEnum.TENSOR_CHECK, | |||||
| parameters=[], | |||||
| supported_target_type=TargetTypeEnum.TENSOR, | |||||
| supported_platforms=(PlatformEnum.ASCEND,), | |||||
| minimum_debugger_capability=(1, 0) | |||||
| ), | |||||
| Condition( | |||||
| condition_id=ConditionIdEnum.INF, | |||||
| abbr="INF", | |||||
| # Send this condition to MindSpore will use WatchCondition.Condition.inf | |||||
| optimize_phase=OptimizePhaseEnum.TENSOR_CHECK, | |||||
| parameters=[], | |||||
| supported_target_type=TargetTypeEnum.TENSOR, | |||||
| supported_platforms=(PlatformEnum.ASCEND, PlatformEnum.GPU), | |||||
| minimum_debugger_capability=(1, 0) | |||||
| ), | |||||
| Condition( | |||||
| condition_id=ConditionIdEnum.MAX_GT, | |||||
| abbr="MAX>", | |||||
| # Send this condition to MindSpore will use WatchCondition.Condition.max_gt | |||||
| optimize_phase=OptimizePhaseEnum.TENSOR_CHECK, | |||||
| parameters=[ | |||||
| ConditionParameter( | |||||
| name="param", | |||||
| value_type=ValueTypeEnum.FLOAT64, | |||||
| valid_test_func=check_normal_param_range | |||||
| ) | |||||
| ], | |||||
| supported_target_type=TargetTypeEnum.TENSOR, | |||||
| supported_platforms=(PlatformEnum.ASCEND, PlatformEnum.GPU), | |||||
| minimum_debugger_capability=(1, 0) | |||||
| ), | |||||
| Condition( | |||||
| condition_id=ConditionIdEnum.MAX_LT, | |||||
| abbr="MAX<", | |||||
| # Send this condition to MindSpore will use WatchCondition.Condition.max_lt | |||||
| optimize_phase=OptimizePhaseEnum.TENSOR_CHECK, | |||||
| parameters=[ | |||||
| ConditionParameter( | |||||
| name="param", | |||||
| value_type=ValueTypeEnum.FLOAT64, | |||||
| valid_test_func=check_normal_param_range | |||||
| ) | |||||
| ], | |||||
| supported_target_type=TargetTypeEnum.TENSOR, | |||||
| supported_platforms=(PlatformEnum.ASCEND, PlatformEnum.GPU), | |||||
| minimum_debugger_capability=(1, 0) | |||||
| ), | |||||
| Condition( | |||||
| condition_id=ConditionIdEnum.MIN_GT, | |||||
| abbr="MIN>", | |||||
| # Send this condition to MindSpore will use WatchCondition.Condition.min_gt | |||||
| optimize_phase=OptimizePhaseEnum.TENSOR_CHECK, | |||||
| parameters=[ | |||||
| ConditionParameter( | |||||
| name="param", | |||||
| value_type=ValueTypeEnum.FLOAT64, | |||||
| valid_test_func=check_normal_param_range | |||||
| ) | |||||
| ], | |||||
| supported_target_type=TargetTypeEnum.TENSOR, | |||||
| supported_platforms=(PlatformEnum.ASCEND, PlatformEnum.GPU), | |||||
| minimum_debugger_capability=(1, 0) | |||||
| ), | |||||
| Condition( | |||||
| condition_id=ConditionIdEnum.MIN_LT, | |||||
| abbr="MIN<", | |||||
| # Send this condition to MindSpore will use WatchCondition.Condition.min_lt | |||||
| optimize_phase=OptimizePhaseEnum.TENSOR_CHECK, | |||||
| parameters=[ | |||||
| ConditionParameter( | |||||
| name="param", | |||||
| value_type=ValueTypeEnum.FLOAT64, | |||||
| valid_test_func=check_normal_param_range | |||||
| ) | |||||
| ], | |||||
| supported_target_type=TargetTypeEnum.TENSOR, | |||||
| supported_platforms=(PlatformEnum.ASCEND, PlatformEnum.GPU), | |||||
| minimum_debugger_capability=(1, 0) | |||||
| ), | |||||
| Condition( | |||||
| condition_id=ConditionIdEnum.MAX_MIN_GT, | |||||
| abbr="MAX-MIN>", | |||||
| # Send this condition to MindSpore will use WatchCondition.Condition.max_min_gt | |||||
| optimize_phase=OptimizePhaseEnum.TENSOR_CHECK, | |||||
| parameters=[ | |||||
| ConditionParameter( | |||||
| name="param", | |||||
| value_type=ValueTypeEnum.FLOAT64, | |||||
| valid_test_func=check_normal_param_range | |||||
| ) | |||||
| ], | |||||
| supported_target_type=TargetTypeEnum.TENSOR, | |||||
| supported_platforms=(PlatformEnum.ASCEND, PlatformEnum.GPU), | |||||
| minimum_debugger_capability=(1, 0) | |||||
| ), | |||||
| Condition( | |||||
| condition_id=ConditionIdEnum.MAX_MIN_LT, | |||||
| abbr="MAX-Min<", | |||||
| # Send this condition to MindSpore will use WatchCondition.Condition.max_min_lt | |||||
| optimize_phase=OptimizePhaseEnum.TENSOR_CHECK, | |||||
| parameters=[ | |||||
| ConditionParameter( | |||||
| name="param", | |||||
| value_type=ValueTypeEnum.FLOAT64, | |||||
| valid_test_func=check_normal_param_range | |||||
| ) | |||||
| ], | |||||
| supported_target_type=TargetTypeEnum.TENSOR, | |||||
| supported_platforms=(PlatformEnum.ASCEND, PlatformEnum.GPU), | |||||
| minimum_debugger_capability=(1, 0) | |||||
| ), | |||||
| Condition( | |||||
| condition_id=ConditionIdEnum.MEAN_GT, | |||||
| abbr="MEAN>", | |||||
| # Send this condition to MindSpore will use WatchCondition.Condition.mean_gt | |||||
| optimize_phase=OptimizePhaseEnum.TENSOR_CHECK, | |||||
| parameters=[ | |||||
| ConditionParameter( | |||||
| name="param", | |||||
| value_type=ValueTypeEnum.FLOAT64, | |||||
| valid_test_func=check_normal_param_range | |||||
| ) | |||||
| ], | |||||
| supported_target_type=TargetTypeEnum.TENSOR, | |||||
| supported_platforms=(PlatformEnum.ASCEND, PlatformEnum.GPU), | |||||
| minimum_debugger_capability=(1, 0) | |||||
| ), | |||||
| Condition( | |||||
| condition_id=ConditionIdEnum.MEAN_LT, | |||||
| abbr="MEAN<", | |||||
| # Send this condition to MindSpore will use WatchCondition.Condition.mean_lt | |||||
| optimize_phase=OptimizePhaseEnum.TENSOR_CHECK, | |||||
| parameters=[ | |||||
| ConditionParameter( | |||||
| name="param", | |||||
| value_type=ValueTypeEnum.FLOAT64, | |||||
| valid_test_func=check_normal_param_range | |||||
| ) | |||||
| ], | |||||
| supported_target_type=TargetTypeEnum.TENSOR, | |||||
| supported_platforms=(PlatformEnum.ASCEND, PlatformEnum.GPU), | |||||
| minimum_debugger_capability=(1, 0) | |||||
| ), | |||||
| Condition( | Condition( | ||||
| condition_id=ConditionIdEnum.TENSOR_INITIALIZATION, | condition_id=ConditionIdEnum.TENSOR_INITIALIZATION, | ||||
| abbr="TI", | abbr="TI", | ||||
| @@ -578,13 +419,13 @@ CONDITION_LIST = [ | |||||
| ConditionParameter( | ConditionParameter( | ||||
| name="range_start_inclusive", | name="range_start_inclusive", | ||||
| value_type=ValueTypeEnum.FLOAT64, | value_type=ValueTypeEnum.FLOAT64, | ||||
| valid_test_func=check_not_nan, | |||||
| valid_test_func=check_normal_param_range, | |||||
| param_type=ParamTypeEnum.SUPPORT_PARAM | param_type=ParamTypeEnum.SUPPORT_PARAM | ||||
| ), | ), | ||||
| ConditionParameter( | ConditionParameter( | ||||
| name="range_end_inclusive", | name="range_end_inclusive", | ||||
| value_type=ValueTypeEnum.FLOAT64, | value_type=ValueTypeEnum.FLOAT64, | ||||
| valid_test_func=check_not_nan, | |||||
| valid_test_func=check_normal_param_range, | |||||
| param_type=ParamTypeEnum.SUPPORT_PARAM | param_type=ParamTypeEnum.SUPPORT_PARAM | ||||
| ), | ), | ||||
| ConditionParameter( | ConditionParameter( | ||||
| @@ -623,13 +464,13 @@ CONDITION_LIST = [ | |||||
| ConditionParameter( | ConditionParameter( | ||||
| name="range_start_inclusive", | name="range_start_inclusive", | ||||
| value_type=ValueTypeEnum.FLOAT64, | value_type=ValueTypeEnum.FLOAT64, | ||||
| valid_test_func=check_not_nan, | |||||
| valid_test_func=check_normal_param_range, | |||||
| param_type=ParamTypeEnum.SUPPORT_PARAM | param_type=ParamTypeEnum.SUPPORT_PARAM | ||||
| ), | ), | ||||
| ConditionParameter( | ConditionParameter( | ||||
| name="range_end_inclusive", | name="range_end_inclusive", | ||||
| value_type=ValueTypeEnum.FLOAT64, | value_type=ValueTypeEnum.FLOAT64, | ||||
| valid_test_func=check_not_nan, | |||||
| valid_test_func=check_normal_param_range, | |||||
| param_type=ParamTypeEnum.SUPPORT_PARAM | param_type=ParamTypeEnum.SUPPORT_PARAM | ||||
| ), | ), | ||||
| ConditionParameter( | ConditionParameter( | ||||
| @@ -46,30 +46,6 @@ class ConditionMgr: | |||||
| for condition in conditions: | for condition in conditions: | ||||
| self.register_condition(condition) | self.register_condition(condition) | ||||
| def get_all(self, condition_context): | |||||
| """Get all register conditions.""" | |||||
| conditions = [] | |||||
| for condition in self.conditions.values(): | |||||
| parameters = [] | |||||
| if not condition.is_available(condition_context): | |||||
| continue | |||||
| for param in condition.parameters: | |||||
| if not param.visible_on_ui: | |||||
| continue | |||||
| parameters.append({ | |||||
| "name": param.name, | |||||
| "type": param.type.name, | |||||
| "support_disable": param.support_disable, | |||||
| "default_value": param.default_value | |||||
| }) | |||||
| conditions.append({ | |||||
| "id": condition.id, | |||||
| "parameters": parameters, | |||||
| "supported_target_type": condition.supported_target_type.name | |||||
| }) | |||||
| conditions = sorted(conditions, key=lambda x: x.get('id')) | |||||
| return {"conditions": conditions} | |||||
| def get_condition(self, condition_id) -> Condition: | def get_condition(self, condition_id) -> Condition: | ||||
| """Get condition by condition id""" | """Get condition by condition id""" | ||||
| return self.conditions[condition_id] | return self.conditions[condition_id] | ||||
| @@ -126,9 +102,9 @@ class ConditionMgr: | |||||
| }) | }) | ||||
| reply = [] | reply = [] | ||||
| self.check_and_sort(collections, TargetTypeEnum.ACTIVATION.value, reply) | |||||
| self.check_and_sort(collections, TargetTypeEnum.GRADIENT.value, reply) | |||||
| self.check_and_sort(collections, TargetTypeEnum.TENSOR.value, reply) | self.check_and_sort(collections, TargetTypeEnum.TENSOR.value, reply) | ||||
| self.check_and_sort(collections, TargetTypeEnum.WEIGHT.value, reply) | self.check_and_sort(collections, TargetTypeEnum.WEIGHT.value, reply) | ||||
| self.check_and_sort(collections, TargetTypeEnum.ACTIVATION.value, reply) | |||||
| self.check_and_sort(collections, TargetTypeEnum.GRADIENT.value, reply) | |||||
| return reply | return reply | ||||
| @@ -106,7 +106,7 @@ def recommend_watchpoints(condition_mgr: ConditionMgr, graph_stream, condition_c | |||||
| # add tensor watch points | # add tensor watch points | ||||
| merged_info = get_basic_node_info(TargetTypeEnum.TENSOR.value, graph_stream) | merged_info = get_basic_node_info(TargetTypeEnum.TENSOR.value, graph_stream) | ||||
| _recommend_overflow_ascend_chip(merged_info, condition_mgr, watch_points, condition_context) | |||||
| _recommend_operator_overflow(merged_info, condition_mgr, watch_points, condition_context) | |||||
| _recommend_tensor_overflow(merged_info, condition_mgr, watch_points, condition_context) | _recommend_tensor_overflow(merged_info, condition_mgr, watch_points, condition_context) | ||||
| _recommend_tensor_all_zero(merged_info, condition_mgr, watch_points, condition_context) | _recommend_tensor_all_zero(merged_info, condition_mgr, watch_points, condition_context) | ||||
| @@ -165,21 +165,21 @@ def _recommend_tensor_overflow(basic_info_nodes, condition_mgr, watch_points, co | |||||
| watch_points.append(overflow_watchpoint) | watch_points.append(overflow_watchpoint) | ||||
| def _recommend_overflow_ascend_chip(basic_info_nodes, condition_mgr, watch_points, condition_context): | |||||
| def _recommend_operator_overflow(basic_info_nodes, condition_mgr, watch_points, condition_context): | |||||
| """Recommend tensor overflow watchpoint.""" | """Recommend tensor overflow watchpoint.""" | ||||
| if not basic_info_nodes: | if not basic_info_nodes: | ||||
| return | return | ||||
| if not condition_mgr.has_condition(ConditionIdEnum.OVERFLOW_ASCEND_CHIP.value, condition_context): | |||||
| if not condition_mgr.has_condition(ConditionIdEnum.OPERATOR_OVERFLOW.value, condition_context): | |||||
| return | return | ||||
| condition = condition_mgr.get_condition(condition_id=ConditionIdEnum.OVERFLOW_ASCEND_CHIP.value) | |||||
| condition = condition_mgr.get_condition(condition_id=ConditionIdEnum.OPERATOR_OVERFLOW.value) | |||||
| overflow_d_watchpoint = _WatchPointData( | overflow_d_watchpoint = _WatchPointData( | ||||
| watch_condition={ | watch_condition={ | ||||
| "condition": condition.id, | "condition": condition.id, | ||||
| "params": [] | "params": [] | ||||
| }, | }, | ||||
| watch_nodes=basic_info_nodes.copy(), | watch_nodes=basic_info_nodes.copy(), | ||||
| name='recommend_overflow_ascend_chip_watchpoint' | |||||
| name='recommend_operator_overflow_watchpoint' | |||||
| ) | ) | ||||
| watch_points.append(overflow_d_watchpoint) | watch_points.append(overflow_d_watchpoint) | ||||
| @@ -68,17 +68,10 @@ class DebuggerServer: | |||||
| self.grpc_server_manager = None | self.grpc_server_manager = None | ||||
| self.back_server = None | self.back_server = None | ||||
| def get_conditions(self, train_id): | |||||
| """Get all default conditions""" | |||||
| metadata_stream = self.cache_store.get_stream_handler(Streams.METADATA) | |||||
| condition_context = ConditionContext(metadata_stream.backend, metadata_stream.step, (1, 0)) | |||||
| log.debug("Train_id: %s, backend: %s", train_id, condition_context.backend) | |||||
| return self.condition_mgr.get_all(condition_context) | |||||
| def get_condition_collections(self, train_id): | def get_condition_collections(self, train_id): | ||||
| """Get default condition_collections""" | """Get default condition_collections""" | ||||
| metadata_stream = self.cache_store.get_stream_handler(Streams.METADATA) | metadata_stream = self.cache_store.get_stream_handler(Streams.METADATA) | ||||
| condition_context = ConditionContext(metadata_stream.backend, metadata_stream.step, (1, 0)) | |||||
| condition_context = ConditionContext(metadata_stream.backend, metadata_stream.step, (1, 1)) | |||||
| log.debug("Train_id: %s, backend: %s", train_id, condition_context.backend) | log.debug("Train_id: %s, backend: %s", train_id, condition_context.backend) | ||||
| return self.condition_mgr.get_all_collections(condition_context) | return self.condition_mgr.get_all_collections(condition_context) | ||||
| @@ -88,7 +81,7 @@ class DebuggerServer: | |||||
| log.error("Bool param should be given for set_recommended") | log.error("Bool param should be given for set_recommended") | ||||
| raise DebuggerParamValueError("Bool param should be given.") | raise DebuggerParamValueError("Bool param should be given.") | ||||
| metadata_stream = self.cache_store.get_stream_handler(Streams.METADATA) | metadata_stream = self.cache_store.get_stream_handler(Streams.METADATA) | ||||
| condition_context = ConditionContext(metadata_stream.backend, metadata_stream.step, (1, 0)) | |||||
| condition_context = ConditionContext(metadata_stream.backend, metadata_stream.step, (1, 1)) | |||||
| log.debug("Train_id: %s, backend: %s", train_id, condition_context.backend) | log.debug("Train_id: %s, backend: %s", train_id, condition_context.backend) | ||||
| res = metadata_stream.get(['state', 'enable_recheck']) | res = metadata_stream.get(['state', 'enable_recheck']) | ||||
| if set_recommended and not metadata_stream.recommendation_confirmed: | if set_recommended and not metadata_stream.recommendation_confirmed: | ||||
| @@ -91,17 +91,7 @@ message ViewCMD { | |||||
| message WatchCondition { | message WatchCondition { | ||||
| enum Condition { | enum Condition { | ||||
| nan = 0; | |||||
| inf = 1; | |||||
| overflow = 2; | overflow = 2; | ||||
| max_gt = 3; | |||||
| max_lt = 4; | |||||
| min_gt = 5; | |||||
| min_lt = 6; | |||||
| max_min_gt = 7; | |||||
| max_min_lt = 8; | |||||
| mean_gt = 9; | |||||
| mean_lt = 10; | |||||
| sd_gt = 11; | sd_gt = 11; | ||||
| sd_lt = 12; | sd_lt = 12; | ||||
| tensor_general_overflow = 13; | tensor_general_overflow = 13; | ||||
| @@ -29,18 +29,7 @@ WATCHPOINT_CONDITION_MAPPING = { | |||||
| ConditionIdEnum.GRADIENT_EXPLODING.value: WatchCondition.Condition.tensor_general_overflow, | ConditionIdEnum.GRADIENT_EXPLODING.value: WatchCondition.Condition.tensor_general_overflow, | ||||
| ConditionIdEnum.GRADIENT_TOO_LARGE.value: WatchCondition.Condition.tensor_too_large, | ConditionIdEnum.GRADIENT_TOO_LARGE.value: WatchCondition.Condition.tensor_too_large, | ||||
| ConditionIdEnum.GRADIENT_VANISHING.value: WatchCondition.Condition.tensor_too_small, | ConditionIdEnum.GRADIENT_VANISHING.value: WatchCondition.Condition.tensor_too_small, | ||||
| ConditionIdEnum.INF.value: WatchCondition.Condition.inf, | |||||
| ConditionIdEnum.MAX_GT.value: WatchCondition.Condition.max_gt, | |||||
| ConditionIdEnum.MAX_LT.value: WatchCondition.Condition.max_lt, | |||||
| ConditionIdEnum.MAX_MIN_GT.value: WatchCondition.Condition.max_min_gt, | |||||
| ConditionIdEnum.MAX_MIN_LT.value: WatchCondition.Condition.max_min_lt, | |||||
| ConditionIdEnum.MEAN_GT.value: WatchCondition.Condition.mean_gt, | |||||
| ConditionIdEnum.MEAN_LT.value: WatchCondition.Condition.mean_lt, | |||||
| ConditionIdEnum.MIN_GT.value: WatchCondition.Condition.min_gt, | |||||
| ConditionIdEnum.MIN_LT.value: WatchCondition.Condition.min_lt, | |||||
| ConditionIdEnum.NAN.value: WatchCondition.Condition.nan, | |||||
| ConditionIdEnum.OPERATOR_OVERFLOW.value: WatchCondition.Condition.overflow, | ConditionIdEnum.OPERATOR_OVERFLOW.value: WatchCondition.Condition.overflow, | ||||
| ConditionIdEnum.OVERFLOW_ASCEND_CHIP.value: WatchCondition.Condition.overflow, | |||||
| ConditionIdEnum.TENSOR_ALL_ZERO.value: WatchCondition.Condition.tensor_all_zero, | ConditionIdEnum.TENSOR_ALL_ZERO.value: WatchCondition.Condition.tensor_all_zero, | ||||
| ConditionIdEnum.TENSOR_INITIALIZATION.value: WatchCondition.Condition.tensor_initialization, | ConditionIdEnum.TENSOR_INITIALIZATION.value: WatchCondition.Condition.tensor_initialization, | ||||
| ConditionIdEnum.TENSOR_OVERFLOW.value: WatchCondition.Condition.tensor_general_overflow, | ConditionIdEnum.TENSOR_OVERFLOW.value: WatchCondition.Condition.tensor_general_overflow, | ||||
| @@ -95,13 +95,9 @@ class WatchpointOperator: | |||||
| def _validate_watch_condition(self, watch_condition): | def _validate_watch_condition(self, watch_condition): | ||||
| """Validate watch condition.""" | """Validate watch condition.""" | ||||
| metadata_stream = self._metadata_stream | metadata_stream = self._metadata_stream | ||||
| if metadata_stream.backend == 'GPU' and watch_condition.get('id') in ( | |||||
| ConditionIdEnum.OVERFLOW_ASCEND_CHIP.value, ConditionIdEnum.OPERATOR_OVERFLOW.value): | |||||
| if metadata_stream.backend == 'GPU' and watch_condition.get('id') == ConditionIdEnum.OPERATOR_OVERFLOW.value: | |||||
| log.error("GPU doesn't support overflow watch condition.") | log.error("GPU doesn't support overflow watch condition.") | ||||
| raise DebuggerParamValueError("GPU doesn't support overflow watch condition.") | raise DebuggerParamValueError("GPU doesn't support overflow watch condition.") | ||||
| if metadata_stream.backend == 'Ascend' and watch_condition.get('id') == ConditionIdEnum.NAN.value: | |||||
| log.error("Ascend doesn't support nan watch condition.") | |||||
| raise DebuggerParamValueError("Ascend doesn't support nan watch condition.") | |||||
| def update_watchpoint(self, params): | def update_watchpoint(self, params): | ||||
| """ | """ | ||||
| @@ -1 +1 @@ | |||||
| {"watch_points": [{"id": 1, "watch_condition": {"id": "max_gt", "params": [{"name": "param", "value": 1.0}], "abbr": "MAX>"}}, {"id": 2, "watch_condition": {"id": "max_lt", "params": [{"name": "param", "value": -1.0}], "abbr": "MAX<"}}, {"id": 3, "watch_condition": {"id": "min_gt", "params": [{"name": "param", "value": 1e+32}], "abbr": "MIN>"}}, {"id": 5, "watch_condition": {"id": "max_min_gt", "params": [{"name": "param", "value": 0}], "abbr": "MAX-MIN>"}}, {"id": 6, "watch_condition": {"id": "max_min_lt", "params": [{"name": "param", "value": 0}], "abbr": "MAX-Min<"}}, {"id": 7, "watch_condition": {"id": "mean_gt", "params": [{"name": "param", "value": 0}], "abbr": "MEAN>"}}, {"id": 8, "watch_condition": {"id": "mean_lt", "params": [{"name": "param", "value": 0}], "abbr": "MEAN<"}}, {"id": 9, "watch_condition": {"id": "inf", "params": [], "abbr": "INF"}}, {"id": 10, "watch_condition": {"id": "overflow", "params": [], "abbr": "OVERFLOW"}}]} | |||||
| {"watch_points": [{"id": 1, "watch_condition": {"id": "tensor_too_large", "params": [{"name": "max_gt", "value": 1.0}], "abbr": "TL"}}, {"id": 2, "watch_condition": {"id": "tensor_too_small", "params": [{"name": "max_lt", "value": -1.0}], "abbr": "TS"}}, {"id": 3, "watch_condition": {"id": "tensor_too_large", "params": [{"name": "min_gt", "value": 1e+32}], "abbr": "TL"}}, {"id": 5, "watch_condition": {"id": "tensor_too_large", "params": [{"name": "mean_gt", "value": 0}], "abbr": "TL"}}, {"id": 6, "watch_condition": {"id": "tensor_too_small", "params": [{"name": "mean_lt", "value": 0}], "abbr": "TS"}}]} | |||||
| @@ -1 +1 @@ | |||||
| {"graph": {"nodes": [{"name": "Default/args0", "full_name": "Default/args0", "type": "Parameter", "input": {}, "output": {"Default/TransData-op99": {"shape": [[32, 1, 32, 32]], "edge_type": "data", "independent_layout": false, "data_type": "DT_TENSOR[DT_FLOAT32]", "slot_mapping": [["0", ""]]}}, "slots": [{"slot": "0", "statistics": {"overall_max": 6.0, "overall_min": 1.0, "overall_avg": 3.5, "overall_count": 6, "overall_nan_count": 0, "overall_neg_inf_count": 0, "overall_pos_inf_count": 0, "overall_zero_count": 0.0, "overall_neg_zero_count": 0.0, "overall_pos_zero_count": 6.0}, "shape": [2, 3], "has_prev_step": true}], "graph_name": "graph_0"}, {"name": "Default/network-WithLossCell/_backbone-LeNet5/conv1-Conv2d/Cast-op190", "full_name": "Default/network-WithLossCell/_backbone-LeNet5/conv1-Conv2d/Cast-op190", "type": "Cast", "input": {"Default/TransData-op99": {"shape": [[32, 1, 32, 32]], "edge_type": "data", "independent_layout": false, "data_type": "DT_TENSOR[DT_FLOAT32]", "slot_mapping": [["0", ""]]}}, "output": {}, "slots": [{"slot": "0", "statistics": {"overall_max": 6.0, "overall_min": 1.0, "overall_avg": 3.5, "overall_count": 6, "overall_nan_count": 0, "overall_neg_inf_count": 0, "overall_pos_inf_count": 0, "overall_zero_count": 0.0, "overall_neg_zero_count": 0.0, "overall_pos_zero_count": 6.0}, "shape": [2, 3]}], "graph_name": "graph_0"}, {"name": "Default/TransData-op99", "full_name": "Default/TransData-op99", "type": "TransData", "input": {"Default/args0": {"shape": [[32, 1, 32, 32]], "edge_type": "data", "independent_layout": false, "data_type": "DT_TENSOR[DT_FLOAT32]", "slot_mapping": [["0", ""]]}}, "output": {"Default/network-WithLossCell/_backbone-LeNet5/conv1-Conv2d/Cast-op190": {"shape": [[32, 1, 32, 32]], "edge_type": "data", "independent_layout": false, "data_type": "DT_TENSOR[DT_FLOAT32]", "slot_mapping": [["0", ""]]}}, "slots": [{"slot": "0", "summarized_error_code": 0, "watch_points": [{"id": 1, "watch_condition": {"id": "inf", "params": [], "abbr": "INF"}, "error_code": 0}], "statistics": {"overall_max": 6.0, "overall_min": 1.0, "overall_avg": 3.5, "overall_count": 6, "overall_nan_count": 0, "overall_neg_inf_count": 0, "overall_pos_inf_count": 0, "overall_zero_count": 0.0, "overall_neg_zero_count": 0.0, "overall_pos_zero_count": 6.0}, "shape": [2, 3]}], "graph_name": "graph_0"}]}} | |||||
| {"graph": {"nodes": [{"name": "Default/args0", "full_name": "Default/args0", "type": "Parameter", "input": {}, "output": {"Default/TransData-op99": {"shape": [[32, 1, 32, 32]], "edge_type": "data", "independent_layout": false, "data_type": "DT_TENSOR[DT_FLOAT32]", "slot_mapping": [["0", ""]]}}, "slots": [{"slot": "0", "statistics": {"overall_max": 6.0, "overall_min": 1.0, "overall_avg": 3.5, "overall_count": 6, "overall_nan_count": 0, "overall_neg_inf_count": 0, "overall_pos_inf_count": 0, "overall_zero_count": 0.0, "overall_neg_zero_count": 0.0, "overall_pos_zero_count": 6.0}, "shape": [2, 3], "has_prev_step": true}], "graph_name": "graph_0"}, {"name": "Default/network-WithLossCell/_backbone-LeNet5/conv1-Conv2d/Cast-op190", "full_name": "Default/network-WithLossCell/_backbone-LeNet5/conv1-Conv2d/Cast-op190", "type": "Cast", "input": {"Default/TransData-op99": {"shape": [[32, 1, 32, 32]], "edge_type": "data", "independent_layout": false, "data_type": "DT_TENSOR[DT_FLOAT32]", "slot_mapping": [["0", ""]]}}, "output": {}, "slots": [{"slot": "0", "statistics": {"overall_max": 6.0, "overall_min": 1.0, "overall_avg": 3.5, "overall_count": 6, "overall_nan_count": 0, "overall_neg_inf_count": 0, "overall_pos_inf_count": 0, "overall_zero_count": 0.0, "overall_neg_zero_count": 0.0, "overall_pos_zero_count": 6.0}, "shape": [2, 3]}], "graph_name": "graph_0"}, {"name": "Default/TransData-op99", "full_name": "Default/TransData-op99", "type": "TransData", "input": {"Default/args0": {"shape": [[32, 1, 32, 32]], "edge_type": "data", "independent_layout": false, "data_type": "DT_TENSOR[DT_FLOAT32]", "slot_mapping": [["0", ""]]}}, "output": {"Default/network-WithLossCell/_backbone-LeNet5/conv1-Conv2d/Cast-op190": {"shape": [[32, 1, 32, 32]], "edge_type": "data", "independent_layout": false, "data_type": "DT_TENSOR[DT_FLOAT32]", "slot_mapping": [["0", ""]]}}, "slots": [{"slot": "0", "summarized_error_code": 0, "watch_points": [{"id": 1, "watch_condition": {"id": "tensor_too_large", "params": [{"name": "max_gt", "value": 1.0, "actual_value": null}], "abbr": "TL"}, "error_code": 0}], "statistics": {"overall_max": 6.0, "overall_min": 1.0, "overall_avg": 3.5, "overall_count": 6, "overall_nan_count": 0, "overall_neg_inf_count": 0, "overall_pos_inf_count": 0, "overall_zero_count": 0.0, "overall_neg_zero_count": 0.0, "overall_pos_zero_count": 6.0}, "shape": [2, 3]}], "graph_name": "graph_0"}]}} | |||||
| @@ -1,47 +1 @@ | |||||
| { | |||||
| "watch_point_hits": [ | |||||
| { | |||||
| "node_name": "Default/TransData-op99", | |||||
| "tensors": [ | |||||
| { | |||||
| "slot": "0", | |||||
| "summarized_error_code": 0, | |||||
| "watch_points": [ | |||||
| { | |||||
| "id": 1, | |||||
| "watch_condition": { | |||||
| "id": "inf", | |||||
| "params": [], | |||||
| "abbr": "INF" | |||||
| }, | |||||
| "error_code": 0 | |||||
| } | |||||
| ] | |||||
| } | |||||
| ], | |||||
| "graph_name": "graph_0" | |||||
| }, | |||||
| { | |||||
| "node_name": "Default/optimizer-Momentum/ApplyMomentum[8]_1/ApplyMomentum-op25", | |||||
| "tensors": [ | |||||
| { | |||||
| "slot": "0", | |||||
| "summarized_error_code": 0, | |||||
| "watch_points": [ | |||||
| { | |||||
| "id": 1, | |||||
| "watch_condition": { | |||||
| "id": "inf", | |||||
| "params": [], | |||||
| "abbr": "INF" | |||||
| }, | |||||
| "error_code": 0 | |||||
| } | |||||
| ] | |||||
| } | |||||
| ], | |||||
| "graph_name": "graph_0" | |||||
| } | |||||
| ], | |||||
| "outdated": false | |||||
| } | |||||
| {"watch_point_hits": [{"node_name": "Default/TransData-op99", "tensors": [{"slot": "0", "summarized_error_code": 0, "watch_points": [{"id": 1, "watch_condition": {"id": "tensor_too_large", "params": [{"name": "max_gt", "value": 1.0, "actual_value": null}], "abbr": "TL"}, "error_code": 0}]}], "graph_name": "graph_0"}, {"node_name": "Default/optimizer-Momentum/ApplyMomentum[8]_1/ApplyMomentum-op25", "tensors": [{"slot": "0", "summarized_error_code": 0, "watch_points": [{"id": 1, "watch_condition": {"id": "tensor_too_large", "params": [{"name": "max_gt", "value": 1.0, "actual_value": null}], "abbr": "TL"}, "error_code": 0}]}], "graph_name": "graph_0"}], "outdated": false} | |||||
| @@ -84,7 +84,7 @@ class TestAscendDebugger: | |||||
| def test_get_conditions(self, app_client): | def test_get_conditions(self, app_client): | ||||
| """Test get conditions for ascend.""" | """Test get conditions for ascend.""" | ||||
| url = '/v1/mindinsight/conditionmgr/train-jobs/train-id/conditions' | |||||
| url = '/v1/mindinsight/conditionmgr/train-jobs/train-id/condition-collections' | |||||
| body_data = {} | body_data = {} | ||||
| expect_file = 'get_conditions_for_ascend.json' | expect_file = 'get_conditions_for_ascend.json' | ||||
| with self._debugger_client.get_thread_instance(): | with self._debugger_client.get_thread_instance(): | ||||
| @@ -131,16 +131,12 @@ class TestAscendDebugger: | |||||
| with self._debugger_client.get_thread_instance(): | with self._debugger_client.get_thread_instance(): | ||||
| check_state(app_client) | check_state(app_client) | ||||
| conditions = [ | conditions = [ | ||||
| {'id': 'max_gt', 'params': [{'name': 'param', 'value': 1.0}]}, | |||||
| {'id': 'max_lt', 'params': [{'name': 'param', 'value': -1.0}]}, | |||||
| {'id': 'min_gt', 'params': [{'name': 'param', 'value': 1e+32}]}, | |||||
| {'id': 'min_lt', 'params': [{'name': 'param', 'value': -1e+32}]}, | |||||
| {'id': 'max_min_gt', 'params': [{'name': 'param', 'value': 0}]}, | |||||
| {'id': 'max_min_lt', 'params': [{'name': 'param', 'value': 0}]}, | |||||
| {'id': 'mean_gt', 'params': [{'name': 'param', 'value': 0}]}, | |||||
| {'id': 'mean_lt', 'params': [{'name': 'param', 'value': 0}]}, | |||||
| {'id': 'inf', 'params': []}, | |||||
| {'id': 'overflow', 'params': []}, | |||||
| {'id': 'tensor_too_large', 'params': [{'name': 'max_gt', 'value': 1.0}]}, | |||||
| {'id': 'tensor_too_small', 'params': [{'name': 'max_lt', 'value': -1.0}]}, | |||||
| {'id': 'tensor_too_large', 'params': [{'name': 'min_gt', 'value': 1e+32}]}, | |||||
| {'id': 'tensor_too_small', 'params': [{'name': 'min_lt', 'value': -1e+32}]}, | |||||
| {'id': 'tensor_too_large', 'params': [{'name': 'mean_gt', 'value': 0}]}, | |||||
| {'id': 'tensor_too_small', 'params': [{'name': 'mean_lt', 'value': 0}]} | |||||
| ] | ] | ||||
| for idx, condition in enumerate(conditions): | for idx, condition in enumerate(conditions): | ||||
| create_watchpoint(app_client, condition, idx + 1) | create_watchpoint(app_client, condition, idx + 1) | ||||
| @@ -167,7 +163,7 @@ class TestAscendDebugger: | |||||
| leaf_node_name = 'Default/optimizer-Momentum/Parameter[18]_7/moments.fc3.bias' | leaf_node_name = 'Default/optimizer-Momentum/Parameter[18]_7/moments.fc3.bias' | ||||
| with self._debugger_client.get_thread_instance(): | with self._debugger_client.get_thread_instance(): | ||||
| check_state(app_client) | check_state(app_client) | ||||
| condition = {'id': 'inf', 'params': []} | |||||
| condition = {'id': 'tensor_too_large', 'params': [{'name': 'max_gt', 'value': 1.0}]} | |||||
| create_watchpoint(app_client, condition, watch_point_id) | create_watchpoint(app_client, condition, watch_point_id) | ||||
| # update watchpoint watchpoint list | # update watchpoint watchpoint list | ||||
| url = 'update_watchpoint' | url = 'update_watchpoint' | ||||
| @@ -327,7 +323,7 @@ class TestAscendDebugger: | |||||
| @pytest.mark.platform_x86_ascend_training | @pytest.mark.platform_x86_ascend_training | ||||
| @pytest.mark.parametrize("url, body_data, enable_recheck", [ | @pytest.mark.parametrize("url, body_data, enable_recheck", [ | ||||
| ('create_watchpoint', | ('create_watchpoint', | ||||
| {'condition': {'id': 'inf', 'params': []}, | |||||
| {'condition': {'id': 'tensor_too_large', 'params': [{'name': 'max_gt', 'value': 1.0}]}, | |||||
| 'watch_nodes': ['Default']}, True), | 'watch_nodes': ['Default']}, True), | ||||
| ('update_watchpoint', | ('update_watchpoint', | ||||
| {'watch_point_id': 1, 'watch_nodes': ['Default/optimizer-Momentum/Parameter[18]_7'], | {'watch_point_id': 1, 'watch_nodes': ['Default/optimizer-Momentum/Parameter[18]_7'], | ||||
| @@ -434,10 +430,10 @@ class TestGPUDebugger: | |||||
| @pytest.mark.platform_x86_ascend_training | @pytest.mark.platform_x86_ascend_training | ||||
| @pytest.mark.parametrize("url, body_data, enable_recheck", [ | @pytest.mark.parametrize("url, body_data, enable_recheck", [ | ||||
| ('create_watchpoint', | ('create_watchpoint', | ||||
| {'condition': {'id': 'inf', 'params': []}, | |||||
| {'condition': {'id': 'tensor_too_large', 'params': [{'name': 'max_gt', 'value': 1.0}]}, | |||||
| 'watch_nodes': ['Default']}, True), | 'watch_nodes': ['Default']}, True), | ||||
| ('create_watchpoint', | ('create_watchpoint', | ||||
| {'condition': {'id': 'inf', 'params': []}, | |||||
| {'condition': {'id': 'tensor_too_large', 'params': [{'name': 'max_gt', 'value': 1.0}]}, | |||||
| 'watch_nodes': ['Default/TransData-op99']}, True), | 'watch_nodes': ['Default/TransData-op99']}, True), | ||||
| ('update_watchpoint', | ('update_watchpoint', | ||||
| {'watch_point_id': 1, 'watch_nodes': ['Default/optimizer-Momentum/Parameter[18]_7'], | {'watch_point_id': 1, 'watch_nodes': ['Default/optimizer-Momentum/Parameter[18]_7'], | ||||
| @@ -472,7 +468,7 @@ class TestGPUDebugger: | |||||
| def test_get_conditions(self, app_client): | def test_get_conditions(self, app_client): | ||||
| """Test get conditions for gpu.""" | """Test get conditions for gpu.""" | ||||
| url = '/v1/mindinsight/conditionmgr/train-jobs/train-id/conditions' | |||||
| url = '/v1/mindinsight/conditionmgr/train-jobs/train-id/condition-collections' | |||||
| body_data = {} | body_data = {} | ||||
| expect_file = 'get_conditions_for_gpu.json' | expect_file = 'get_conditions_for_gpu.json' | ||||
| with self._debugger_client.get_thread_instance(): | with self._debugger_client.get_thread_instance(): | ||||
| @@ -493,7 +489,7 @@ class TestGPUDebugger: | |||||
| # send recheck when disable to do recheck | # send recheck when disable to do recheck | ||||
| get_request_result(app_client, 'recheck', {}, method='post', expect_code=400) | get_request_result(app_client, 'recheck', {}, method='post', expect_code=400) | ||||
| # send recheck when enable to do recheck | # send recheck when enable to do recheck | ||||
| create_watchpoint(app_client, {'id': 'inf', 'params': []}, 2) | |||||
| create_watchpoint(app_client, {'id': 'tensor_too_large', 'params': [{'name': 'max_gt', 'value': 1.0}]}, 2) | |||||
| res = get_request_result(app_client, 'recheck', {}, method='post') | res = get_request_result(app_client, 'recheck', {}, method='post') | ||||
| assert res['metadata']['enable_recheck'] is False | assert res['metadata']['enable_recheck'] is False | ||||
| @@ -579,10 +575,10 @@ class TestMultiGraphDebugger: | |||||
| @pytest.mark.platform_x86_gpu_training | @pytest.mark.platform_x86_gpu_training | ||||
| @pytest.mark.platform_x86_ascend_training | @pytest.mark.platform_x86_ascend_training | ||||
| @pytest.mark.parametrize("filter_condition, expect_id", [ | @pytest.mark.parametrize("filter_condition, expect_id", [ | ||||
| ({'condition': {'id': 'inf'}, | |||||
| ({'condition': {'id': 'tensor_too_large', 'params': [{'name': 'max_gt', 'value': 1.0}]}, | |||||
| 'watch_nodes': ['Default/optimizer-Momentum/Parameter[18]_7'], | 'watch_nodes': ['Default/optimizer-Momentum/Parameter[18]_7'], | ||||
| 'graph_name': 'graph_0'}, 1), | 'graph_name': 'graph_0'}, 1), | ||||
| ({'condition': {'id': 'inf'}, | |||||
| ({'condition': {'id': 'tensor_too_large', 'params': [{'name': 'max_gt', 'value': 1.0}]}, | |||||
| 'watch_nodes': ['graph_0/Default/optimizer-Momentum/ApplyMomentum[8]_1'], | 'watch_nodes': ['graph_0/Default/optimizer-Momentum/ApplyMomentum[8]_1'], | ||||
| 'graph_name': None}, 1) | 'graph_name': None}, 1) | ||||
| ]) | ]) | ||||
| @@ -665,7 +661,8 @@ def create_watchpoint(app_client, condition, expect_id): | |||||
| def create_watchpoint_and_wait(app_client): | def create_watchpoint_and_wait(app_client): | ||||
| """Preparation for recheck.""" | """Preparation for recheck.""" | ||||
| check_state(app_client) | check_state(app_client) | ||||
| create_watchpoint(app_client, condition={'id': 'inf', 'params': []}, expect_id=1) | |||||
| create_watchpoint(app_client, condition={'id': 'tensor_too_large', 'params': [{'name': 'max_gt', 'value': 1.0}]}, | |||||
| expect_id=1) | |||||
| # send run command to get watchpoint hit | # send run command to get watchpoint hit | ||||
| url = 'control' | url = 'control' | ||||
| body_data = {'mode': 'continue', | body_data = {'mode': 'continue', | ||||
| @@ -74,7 +74,7 @@ def send_and_save_result(app_client, url, body_data, file_path, method='post'): | |||||
| def delete_random_items(res): | def delete_random_items(res): | ||||
| """delete the random items in metadata.""" | """delete the random items in metadata.""" | ||||
| if res.get('metadata'): | |||||
| if isinstance(res, dict) and res.get('metadata'): | |||||
| if res['metadata'].get('ip'): | if res['metadata'].get('ip'): | ||||
| res['metadata'].pop('ip') | res['metadata'].pop('ip') | ||||
| if res['metadata'].get('pos'): | if res['metadata'].get('pos'): | ||||
| @@ -1,25 +1,5 @@ | |||||
| [ | [ | ||||
| { | |||||
| "watchCondition": { | |||||
| "condition": "inf" | |||||
| }, | |||||
| "id": 1, | |||||
| "watch_nodes_num": 0 | |||||
| }, | |||||
| { | |||||
| "watchCondition": { | |||||
| "condition": "inf" | |||||
| }, | |||||
| "id": 2, | |||||
| "watch_nodes_num": 172 | |||||
| }, | |||||
| { | |||||
| "watchCondition": { | |||||
| "condition": "max_gt", | |||||
| "params": [{"name": "param", "value": 1}], | |||||
| "value": 1 | |||||
| }, | |||||
| "id": 3, | |||||
| "watch_nodes_num": 1 | |||||
| } | |||||
| {"watchCondition": {"condition": "tensor_too_small", "value": 1.0, "params": [{"name": "abs_mean_lt", "disabled": true}, {"name": "max_lt", "value": 1.0}, {"name": "min_lt", "disabled": true}, {"name": "mean_lt", "disabled": true}]}, "id": 1, "watch_nodes_num": 0}, | |||||
| {"watchCondition": {"condition": "tensor_too_small", "value": 1.0, "params": [{"name": "abs_mean_lt", "disabled": true}, {"name": "max_lt", "disabled": true}, {"name": "min_lt", "value": 1.0}, {"name": "mean_lt", "disabled": true}]}, "id": 2, "watch_nodes_num": 172}, | |||||
| {"watchCondition": {"condition": "tensor_too_large", "value": 1.0, "params": [{"name": "abs_mean_gt", "disabled": true}, {"name": "max_gt", "value": 1.0}, {"name": "min_gt", "disabled": true}, {"name": "mean_gt", "disabled": true}]}, "id": 3, "watch_nodes_num": 1} | |||||
| ] | ] | ||||
| @@ -1 +1 @@ | |||||
| [{"id": 1, "watch_condition": {"id": "inf", "params": [], "abbr": "INF"}}, {"id": 2, "watch_condition": {"id": "inf", "params": [], "abbr": "INF"}}, {"id": 3, "watch_condition": {"id": "max_gt", "params": [{"name": "param", "value": 1}], "abbr": "MAX>"}}] | |||||
| [{"id": 1, "watch_condition": {"id": "tensor_too_small", "params": [{"name": "max_lt", "value": 1.0}], "abbr": "TS"}}, {"id": 2, "watch_condition": {"id": "tensor_too_small", "params": [{"name": "min_lt", "value": 1.0}], "abbr": "TS"}}, {"id": 3, "watch_condition": {"id": "tensor_too_large", "params": [{"name": "max_gt", "value": 1.0}], "abbr": "TL"}}] | |||||
| @@ -61,9 +61,9 @@ class TestWatchpointHandler: | |||||
| def _create_watchpoint(self): | def _create_watchpoint(self): | ||||
| """Test create_watchpoint.""" | """Test create_watchpoint.""" | ||||
| watchpoints = [ | watchpoints = [ | ||||
| ({'id': 'inf', 'params': []}, None, None, 1), | |||||
| ({'id': 'inf', 'params': []}, ["Default"], None, 2), | |||||
| ({'id': 'max_gt', 'params': [{'name': 'param', 'value': 1}]}, | |||||
| ({'id': 'tensor_too_small', 'params': [{'name': 'max_lt', 'value': 1.0}]}, None, None, 1), | |||||
| ({'id': 'tensor_too_small', 'params': [{'name': 'min_lt', 'value': 1.0}]}, ["Default"], None, 2), | |||||
| ({'id': 'tensor_too_large', 'params': [{'name': 'max_gt', 'value': 1.0}]}, | |||||
| ["Gradients/Default/network-WithLossCell/_backbone-LeNet5/relu-ReLU/gradReLU/ReluGradV2-op92"], | ["Gradients/Default/network-WithLossCell/_backbone-LeNet5/relu-ReLU/gradReLU/ReluGradV2-op92"], | ||||
| None, 3) | None, 3) | ||||
| ] | ] | ||||
| @@ -160,7 +160,8 @@ class TestWatchpointHandler: | |||||
| expect_deleted_ids): | expect_deleted_ids): | ||||
| """Test delete_watchpoint.""" | """Test delete_watchpoint.""" | ||||
| for _ in range(watch_point_id): | for _ in range(watch_point_id): | ||||
| self.handler.create_watchpoint(self.conditionmgr, {'id': 'inf', 'param': []}) | |||||
| self.handler.create_watchpoint(self.conditionmgr, | |||||
| {'id': 'tensor_too_small', 'params': [{'name': 'max_lt', 'value': 1.0}]}) | |||||
| with TestCase().assertLogs(logger=log, level='DEBUG') as log_content: | with TestCase().assertLogs(logger=log, level='DEBUG') as log_content: | ||||
| self.handler.delete_watchpoint(watch_point_id) | self.handler.delete_watchpoint(watch_point_id) | ||||
| TestCase().assertIn( | TestCase().assertIn( | ||||
| @@ -233,13 +234,13 @@ def test_validate_watch_condition_type_error(): | |||||
| def test_validate_watch_condition_params_except(): | def test_validate_watch_condition_params_except(): | ||||
| """Test validate_watch_condition_params.""" | """Test validate_watch_condition_params.""" | ||||
| watch_condition = {'id': 'inf', 'params': [{'name': 'param', 'value': 0}]} | |||||
| watch_condition = {'id': 'weight_overflow', 'params': [{'name': 'param', 'value': 0}]} | |||||
| conditionmgr = ConditionMgr() | conditionmgr = ConditionMgr() | ||||
| with pytest.raises(DebuggerParamValueError) as err: | with pytest.raises(DebuggerParamValueError) as err: | ||||
| validate_watch_condition_params(conditionmgr, watch_condition) | validate_watch_condition_params(conditionmgr, watch_condition) | ||||
| assert err.value.error_code == '5054B081' | assert err.value.error_code == '5054B081' | ||||
| watch_condition = {'id': 'max_gt', 'params': [{'name': 'param', 'value': '0'}]} | |||||
| watch_condition = {'id': 'tensor_overflow', 'params': [{'name': 'param', 'value': '0'}]} | |||||
| with pytest.raises(DebuggerParamValueError) as err: | with pytest.raises(DebuggerParamValueError) as err: | ||||
| validate_watch_condition_params(conditionmgr, watch_condition) | validate_watch_condition_params(conditionmgr, watch_condition) | ||||
| assert err.value.error_code == '5054B081' | assert err.value.error_code == '5054B081' | ||||
| @@ -199,8 +199,9 @@ class TestDebuggerServer: | |||||
| def test_create_watchpoint(self, *args): | def test_create_watchpoint(self, *args): | ||||
| """Test create watchpoint.""" | """Test create watchpoint.""" | ||||
| args[0].return_value = 1 | args[0].return_value = 1 | ||||
| res = self._server.create_watchpoint({'watch_condition': {'id': 'inf'}, | |||||
| 'watch_nodes': ['watch_node_name']}) | |||||
| res = self._server.create_watchpoint( | |||||
| {'watch_condition': {'id': 'tensor_too_large', 'params': [{'name': 'max_gt', 'value': 1.0}]}, | |||||
| 'watch_nodes': ['watch_node_name']}) | |||||
| assert res == {'id': 1, 'metadata': {'enable_recheck': False, 'state': 'waiting'}} | assert res == {'id': 1, 'metadata': {'enable_recheck': False, 'state': 'waiting'}} | ||||
| @mock.patch.object(MetadataHandler, 'state', 'waiting') | @mock.patch.object(MetadataHandler, 'state', 'waiting') | ||||