From: @jiang-shuqiang Reviewed-by: @wenkai_dist,@ouwenchang Signed-off-by: @ouwenchangtags/v1.1.0
| @@ -26,13 +26,6 @@ BLUEPRINT = Blueprint("conditionmgr", __name__, | |||
| url_prefix=settings.URL_PATH_PREFIX + settings.API_PREFIX) | |||
| @BLUEPRINT.route("/conditionmgr/train-jobs/<train_id>/conditions", methods=["GET"]) | |||
| def get_conditions(train_id): | |||
| """get conditions""" | |||
| reply = _wrap_reply(BACKEND_SERVER.get_conditions, train_id) | |||
| return reply | |||
| @BLUEPRINT.route("/conditionmgr/train-jobs/<train_id>/condition-collections", methods=["GET"]) | |||
| def get_condition_collections(train_id): | |||
| """get condition collections""" | |||
| @@ -64,4 +64,4 @@ MAX_HISTOGRAM_STEP_SIZE_PER_TAG = 50 | |||
| MAX_TENSOR_STEP_SIZE_PER_TAG = 20 | |||
| MAX_TENSOR_RESPONSE_DATA_SIZE = 100000 | |||
| ENABLE_RECOMMENDED_WATCHPOINTS = False | |||
| ENABLE_RECOMMENDED_WATCHPOINTS = True | |||
| @@ -18,8 +18,6 @@ Management of all conditions. | |||
| This module is used to register all conditions, as well as their parameters. | |||
| This module also provide the available conditions to condition_collections api. | |||
| """ | |||
| import math | |||
| from enum import Enum | |||
| from mindinsight.debugger.conditionmgr.log import logger | |||
| @@ -35,17 +33,6 @@ class ConditionIdEnum(Enum): | |||
| GRADIENT_EXPLODING = "gradient_exploding" | |||
| TENSOR_OVERFLOW = "tensor_overflow" | |||
| OPERATOR_OVERFLOW = "operator_overflow" | |||
| NAN = "nan" | |||
| OVERFLOW_ASCEND_CHIP = "overflow" | |||
| INF = "inf" | |||
| MAX_GT = "max_gt" | |||
| MAX_LT = "max_lt" | |||
| MIN_GT = "min_gt" | |||
| MIN_LT = "min_lt" | |||
| MAX_MIN_GT = "max_min_gt" | |||
| MAX_MIN_LT = "max_min_lt" | |||
| MEAN_GT = "mean_gt" | |||
| MEAN_LT = "mean_lt" | |||
| TENSOR_INITIALIZATION = "tensor_initialization" | |||
| TENSOR_TOO_LARGE = "tensor_too_large" | |||
| TENSOR_TOO_SMALL = "tensor_too_small" | |||
| @@ -287,7 +274,3 @@ def check_abs_param_range(value): | |||
| if 0 <= value < float("inf"): | |||
| return True | |||
| return False | |||
| def check_not_nan(value): | |||
| return not math.isnan(value) | |||
| @@ -29,7 +29,6 @@ from mindinsight.debugger.conditionmgr.condition import check_initialization_ava | |||
| from mindinsight.debugger.conditionmgr.condition import check_normal_param_range | |||
| from mindinsight.debugger.conditionmgr.condition import check_percentage_param_range | |||
| from mindinsight.debugger.conditionmgr.condition import check_abs_param_range | |||
| from mindinsight.debugger.conditionmgr.condition import check_not_nan | |||
| CONDITION_LIST = [ | |||
| @@ -67,7 +66,7 @@ CONDITION_LIST = [ | |||
| # Send this condition to MindSpore will use WatchCondition.Condition.tensor_general_overflow | |||
| optimize_phase=OptimizePhaseEnum.TENSOR_CHECK, | |||
| parameters=[], | |||
| supported_target_type=TargetTypeEnum.TENSOR, | |||
| supported_target_type=TargetTypeEnum.WEIGHT, | |||
| supported_platforms=(PlatformEnum.ASCEND, PlatformEnum.GPU), | |||
| minimum_debugger_capability=(1, 1) | |||
| ), | |||
| @@ -225,164 +224,6 @@ CONDITION_LIST = [ | |||
| supported_platforms=(PlatformEnum.ASCEND,), | |||
| minimum_debugger_capability=(1, 1) | |||
| ), | |||
| Condition( | |||
| condition_id=ConditionIdEnum.NAN, | |||
| abbr="NAN", | |||
| # Send this condition to MindSpore will use WatchCondition.Condition.nan | |||
| optimize_phase=OptimizePhaseEnum.TENSOR_CHECK, | |||
| parameters=[], | |||
| supported_target_type=TargetTypeEnum.TENSOR, | |||
| supported_platforms=(PlatformEnum.GPU,), | |||
| minimum_debugger_capability=(1, 0) | |||
| ), | |||
| Condition( | |||
| condition_id=ConditionIdEnum.OVERFLOW_ASCEND_CHIP, | |||
| abbr="OVERFLOW", | |||
| # Send this condition to MindSpore will use WatchCondition.Condition.overflow | |||
| optimize_phase=OptimizePhaseEnum.TENSOR_CHECK, | |||
| parameters=[], | |||
| supported_target_type=TargetTypeEnum.TENSOR, | |||
| supported_platforms=(PlatformEnum.ASCEND,), | |||
| minimum_debugger_capability=(1, 0) | |||
| ), | |||
| Condition( | |||
| condition_id=ConditionIdEnum.INF, | |||
| abbr="INF", | |||
| # Send this condition to MindSpore will use WatchCondition.Condition.inf | |||
| optimize_phase=OptimizePhaseEnum.TENSOR_CHECK, | |||
| parameters=[], | |||
| supported_target_type=TargetTypeEnum.TENSOR, | |||
| supported_platforms=(PlatformEnum.ASCEND, PlatformEnum.GPU), | |||
| minimum_debugger_capability=(1, 0) | |||
| ), | |||
| Condition( | |||
| condition_id=ConditionIdEnum.MAX_GT, | |||
| abbr="MAX>", | |||
| # Send this condition to MindSpore will use WatchCondition.Condition.max_gt | |||
| optimize_phase=OptimizePhaseEnum.TENSOR_CHECK, | |||
| parameters=[ | |||
| ConditionParameter( | |||
| name="param", | |||
| value_type=ValueTypeEnum.FLOAT64, | |||
| valid_test_func=check_normal_param_range | |||
| ) | |||
| ], | |||
| supported_target_type=TargetTypeEnum.TENSOR, | |||
| supported_platforms=(PlatformEnum.ASCEND, PlatformEnum.GPU), | |||
| minimum_debugger_capability=(1, 0) | |||
| ), | |||
| Condition( | |||
| condition_id=ConditionIdEnum.MAX_LT, | |||
| abbr="MAX<", | |||
| # Send this condition to MindSpore will use WatchCondition.Condition.max_lt | |||
| optimize_phase=OptimizePhaseEnum.TENSOR_CHECK, | |||
| parameters=[ | |||
| ConditionParameter( | |||
| name="param", | |||
| value_type=ValueTypeEnum.FLOAT64, | |||
| valid_test_func=check_normal_param_range | |||
| ) | |||
| ], | |||
| supported_target_type=TargetTypeEnum.TENSOR, | |||
| supported_platforms=(PlatformEnum.ASCEND, PlatformEnum.GPU), | |||
| minimum_debugger_capability=(1, 0) | |||
| ), | |||
| Condition( | |||
| condition_id=ConditionIdEnum.MIN_GT, | |||
| abbr="MIN>", | |||
| # Send this condition to MindSpore will use WatchCondition.Condition.min_gt | |||
| optimize_phase=OptimizePhaseEnum.TENSOR_CHECK, | |||
| parameters=[ | |||
| ConditionParameter( | |||
| name="param", | |||
| value_type=ValueTypeEnum.FLOAT64, | |||
| valid_test_func=check_normal_param_range | |||
| ) | |||
| ], | |||
| supported_target_type=TargetTypeEnum.TENSOR, | |||
| supported_platforms=(PlatformEnum.ASCEND, PlatformEnum.GPU), | |||
| minimum_debugger_capability=(1, 0) | |||
| ), | |||
| Condition( | |||
| condition_id=ConditionIdEnum.MIN_LT, | |||
| abbr="MIN<", | |||
| # Send this condition to MindSpore will use WatchCondition.Condition.min_lt | |||
| optimize_phase=OptimizePhaseEnum.TENSOR_CHECK, | |||
| parameters=[ | |||
| ConditionParameter( | |||
| name="param", | |||
| value_type=ValueTypeEnum.FLOAT64, | |||
| valid_test_func=check_normal_param_range | |||
| ) | |||
| ], | |||
| supported_target_type=TargetTypeEnum.TENSOR, | |||
| supported_platforms=(PlatformEnum.ASCEND, PlatformEnum.GPU), | |||
| minimum_debugger_capability=(1, 0) | |||
| ), | |||
| Condition( | |||
| condition_id=ConditionIdEnum.MAX_MIN_GT, | |||
| abbr="MAX-MIN>", | |||
| # Send this condition to MindSpore will use WatchCondition.Condition.max_min_gt | |||
| optimize_phase=OptimizePhaseEnum.TENSOR_CHECK, | |||
| parameters=[ | |||
| ConditionParameter( | |||
| name="param", | |||
| value_type=ValueTypeEnum.FLOAT64, | |||
| valid_test_func=check_normal_param_range | |||
| ) | |||
| ], | |||
| supported_target_type=TargetTypeEnum.TENSOR, | |||
| supported_platforms=(PlatformEnum.ASCEND, PlatformEnum.GPU), | |||
| minimum_debugger_capability=(1, 0) | |||
| ), | |||
| Condition( | |||
| condition_id=ConditionIdEnum.MAX_MIN_LT, | |||
| abbr="MAX-Min<", | |||
| # Send this condition to MindSpore will use WatchCondition.Condition.max_min_lt | |||
| optimize_phase=OptimizePhaseEnum.TENSOR_CHECK, | |||
| parameters=[ | |||
| ConditionParameter( | |||
| name="param", | |||
| value_type=ValueTypeEnum.FLOAT64, | |||
| valid_test_func=check_normal_param_range | |||
| ) | |||
| ], | |||
| supported_target_type=TargetTypeEnum.TENSOR, | |||
| supported_platforms=(PlatformEnum.ASCEND, PlatformEnum.GPU), | |||
| minimum_debugger_capability=(1, 0) | |||
| ), | |||
| Condition( | |||
| condition_id=ConditionIdEnum.MEAN_GT, | |||
| abbr="MEAN>", | |||
| # Send this condition to MindSpore will use WatchCondition.Condition.mean_gt | |||
| optimize_phase=OptimizePhaseEnum.TENSOR_CHECK, | |||
| parameters=[ | |||
| ConditionParameter( | |||
| name="param", | |||
| value_type=ValueTypeEnum.FLOAT64, | |||
| valid_test_func=check_normal_param_range | |||
| ) | |||
| ], | |||
| supported_target_type=TargetTypeEnum.TENSOR, | |||
| supported_platforms=(PlatformEnum.ASCEND, PlatformEnum.GPU), | |||
| minimum_debugger_capability=(1, 0) | |||
| ), | |||
| Condition( | |||
| condition_id=ConditionIdEnum.MEAN_LT, | |||
| abbr="MEAN<", | |||
| # Send this condition to MindSpore will use WatchCondition.Condition.mean_lt | |||
| optimize_phase=OptimizePhaseEnum.TENSOR_CHECK, | |||
| parameters=[ | |||
| ConditionParameter( | |||
| name="param", | |||
| value_type=ValueTypeEnum.FLOAT64, | |||
| valid_test_func=check_normal_param_range | |||
| ) | |||
| ], | |||
| supported_target_type=TargetTypeEnum.TENSOR, | |||
| supported_platforms=(PlatformEnum.ASCEND, PlatformEnum.GPU), | |||
| minimum_debugger_capability=(1, 0) | |||
| ), | |||
| Condition( | |||
| condition_id=ConditionIdEnum.TENSOR_INITIALIZATION, | |||
| abbr="TI", | |||
| @@ -578,13 +419,13 @@ CONDITION_LIST = [ | |||
| ConditionParameter( | |||
| name="range_start_inclusive", | |||
| value_type=ValueTypeEnum.FLOAT64, | |||
| valid_test_func=check_not_nan, | |||
| valid_test_func=check_normal_param_range, | |||
| param_type=ParamTypeEnum.SUPPORT_PARAM | |||
| ), | |||
| ConditionParameter( | |||
| name="range_end_inclusive", | |||
| value_type=ValueTypeEnum.FLOAT64, | |||
| valid_test_func=check_not_nan, | |||
| valid_test_func=check_normal_param_range, | |||
| param_type=ParamTypeEnum.SUPPORT_PARAM | |||
| ), | |||
| ConditionParameter( | |||
| @@ -623,13 +464,13 @@ CONDITION_LIST = [ | |||
| ConditionParameter( | |||
| name="range_start_inclusive", | |||
| value_type=ValueTypeEnum.FLOAT64, | |||
| valid_test_func=check_not_nan, | |||
| valid_test_func=check_normal_param_range, | |||
| param_type=ParamTypeEnum.SUPPORT_PARAM | |||
| ), | |||
| ConditionParameter( | |||
| name="range_end_inclusive", | |||
| value_type=ValueTypeEnum.FLOAT64, | |||
| valid_test_func=check_not_nan, | |||
| valid_test_func=check_normal_param_range, | |||
| param_type=ParamTypeEnum.SUPPORT_PARAM | |||
| ), | |||
| ConditionParameter( | |||
| @@ -46,30 +46,6 @@ class ConditionMgr: | |||
| for condition in conditions: | |||
| self.register_condition(condition) | |||
| def get_all(self, condition_context): | |||
| """Get all register conditions.""" | |||
| conditions = [] | |||
| for condition in self.conditions.values(): | |||
| parameters = [] | |||
| if not condition.is_available(condition_context): | |||
| continue | |||
| for param in condition.parameters: | |||
| if not param.visible_on_ui: | |||
| continue | |||
| parameters.append({ | |||
| "name": param.name, | |||
| "type": param.type.name, | |||
| "support_disable": param.support_disable, | |||
| "default_value": param.default_value | |||
| }) | |||
| conditions.append({ | |||
| "id": condition.id, | |||
| "parameters": parameters, | |||
| "supported_target_type": condition.supported_target_type.name | |||
| }) | |||
| conditions = sorted(conditions, key=lambda x: x.get('id')) | |||
| return {"conditions": conditions} | |||
| def get_condition(self, condition_id) -> Condition: | |||
| """Get condition by condition id""" | |||
| return self.conditions[condition_id] | |||
| @@ -126,9 +102,9 @@ class ConditionMgr: | |||
| }) | |||
| reply = [] | |||
| self.check_and_sort(collections, TargetTypeEnum.ACTIVATION.value, reply) | |||
| self.check_and_sort(collections, TargetTypeEnum.GRADIENT.value, reply) | |||
| self.check_and_sort(collections, TargetTypeEnum.TENSOR.value, reply) | |||
| self.check_and_sort(collections, TargetTypeEnum.WEIGHT.value, reply) | |||
| self.check_and_sort(collections, TargetTypeEnum.ACTIVATION.value, reply) | |||
| self.check_and_sort(collections, TargetTypeEnum.GRADIENT.value, reply) | |||
| return reply | |||
| @@ -106,7 +106,7 @@ def recommend_watchpoints(condition_mgr: ConditionMgr, graph_stream, condition_c | |||
| # add tensor watch points | |||
| merged_info = get_basic_node_info(TargetTypeEnum.TENSOR.value, graph_stream) | |||
| _recommend_overflow_ascend_chip(merged_info, condition_mgr, watch_points, condition_context) | |||
| _recommend_operator_overflow(merged_info, condition_mgr, watch_points, condition_context) | |||
| _recommend_tensor_overflow(merged_info, condition_mgr, watch_points, condition_context) | |||
| _recommend_tensor_all_zero(merged_info, condition_mgr, watch_points, condition_context) | |||
| @@ -165,21 +165,21 @@ def _recommend_tensor_overflow(basic_info_nodes, condition_mgr, watch_points, co | |||
| watch_points.append(overflow_watchpoint) | |||
| def _recommend_overflow_ascend_chip(basic_info_nodes, condition_mgr, watch_points, condition_context): | |||
| def _recommend_operator_overflow(basic_info_nodes, condition_mgr, watch_points, condition_context): | |||
| """Recommend tensor overflow watchpoint.""" | |||
| if not basic_info_nodes: | |||
| return | |||
| if not condition_mgr.has_condition(ConditionIdEnum.OVERFLOW_ASCEND_CHIP.value, condition_context): | |||
| if not condition_mgr.has_condition(ConditionIdEnum.OPERATOR_OVERFLOW.value, condition_context): | |||
| return | |||
| condition = condition_mgr.get_condition(condition_id=ConditionIdEnum.OVERFLOW_ASCEND_CHIP.value) | |||
| condition = condition_mgr.get_condition(condition_id=ConditionIdEnum.OPERATOR_OVERFLOW.value) | |||
| overflow_d_watchpoint = _WatchPointData( | |||
| watch_condition={ | |||
| "condition": condition.id, | |||
| "params": [] | |||
| }, | |||
| watch_nodes=basic_info_nodes.copy(), | |||
| name='recommend_overflow_ascend_chip_watchpoint' | |||
| name='recommend_operator_overflow_watchpoint' | |||
| ) | |||
| watch_points.append(overflow_d_watchpoint) | |||
| @@ -68,17 +68,10 @@ class DebuggerServer: | |||
| self.grpc_server_manager = None | |||
| self.back_server = None | |||
| def get_conditions(self, train_id): | |||
| """Get all default conditions""" | |||
| metadata_stream = self.cache_store.get_stream_handler(Streams.METADATA) | |||
| condition_context = ConditionContext(metadata_stream.backend, metadata_stream.step, (1, 0)) | |||
| log.debug("Train_id: %s, backend: %s", train_id, condition_context.backend) | |||
| return self.condition_mgr.get_all(condition_context) | |||
| def get_condition_collections(self, train_id): | |||
| """Get default condition_collections""" | |||
| metadata_stream = self.cache_store.get_stream_handler(Streams.METADATA) | |||
| condition_context = ConditionContext(metadata_stream.backend, metadata_stream.step, (1, 0)) | |||
| condition_context = ConditionContext(metadata_stream.backend, metadata_stream.step, (1, 1)) | |||
| log.debug("Train_id: %s, backend: %s", train_id, condition_context.backend) | |||
| return self.condition_mgr.get_all_collections(condition_context) | |||
| @@ -88,7 +81,7 @@ class DebuggerServer: | |||
| log.error("Bool param should be given for set_recommended") | |||
| raise DebuggerParamValueError("Bool param should be given.") | |||
| metadata_stream = self.cache_store.get_stream_handler(Streams.METADATA) | |||
| condition_context = ConditionContext(metadata_stream.backend, metadata_stream.step, (1, 0)) | |||
| condition_context = ConditionContext(metadata_stream.backend, metadata_stream.step, (1, 1)) | |||
| log.debug("Train_id: %s, backend: %s", train_id, condition_context.backend) | |||
| res = metadata_stream.get(['state', 'enable_recheck']) | |||
| if set_recommended and not metadata_stream.recommendation_confirmed: | |||
| @@ -91,17 +91,7 @@ message ViewCMD { | |||
| message WatchCondition { | |||
| enum Condition { | |||
| nan = 0; | |||
| inf = 1; | |||
| overflow = 2; | |||
| max_gt = 3; | |||
| max_lt = 4; | |||
| min_gt = 5; | |||
| min_lt = 6; | |||
| max_min_gt = 7; | |||
| max_min_lt = 8; | |||
| mean_gt = 9; | |||
| mean_lt = 10; | |||
| sd_gt = 11; | |||
| sd_lt = 12; | |||
| tensor_general_overflow = 13; | |||
| @@ -29,18 +29,7 @@ WATCHPOINT_CONDITION_MAPPING = { | |||
| ConditionIdEnum.GRADIENT_EXPLODING.value: WatchCondition.Condition.tensor_general_overflow, | |||
| ConditionIdEnum.GRADIENT_TOO_LARGE.value: WatchCondition.Condition.tensor_too_large, | |||
| ConditionIdEnum.GRADIENT_VANISHING.value: WatchCondition.Condition.tensor_too_small, | |||
| ConditionIdEnum.INF.value: WatchCondition.Condition.inf, | |||
| ConditionIdEnum.MAX_GT.value: WatchCondition.Condition.max_gt, | |||
| ConditionIdEnum.MAX_LT.value: WatchCondition.Condition.max_lt, | |||
| ConditionIdEnum.MAX_MIN_GT.value: WatchCondition.Condition.max_min_gt, | |||
| ConditionIdEnum.MAX_MIN_LT.value: WatchCondition.Condition.max_min_lt, | |||
| ConditionIdEnum.MEAN_GT.value: WatchCondition.Condition.mean_gt, | |||
| ConditionIdEnum.MEAN_LT.value: WatchCondition.Condition.mean_lt, | |||
| ConditionIdEnum.MIN_GT.value: WatchCondition.Condition.min_gt, | |||
| ConditionIdEnum.MIN_LT.value: WatchCondition.Condition.min_lt, | |||
| ConditionIdEnum.NAN.value: WatchCondition.Condition.nan, | |||
| ConditionIdEnum.OPERATOR_OVERFLOW.value: WatchCondition.Condition.overflow, | |||
| ConditionIdEnum.OVERFLOW_ASCEND_CHIP.value: WatchCondition.Condition.overflow, | |||
| ConditionIdEnum.TENSOR_ALL_ZERO.value: WatchCondition.Condition.tensor_all_zero, | |||
| ConditionIdEnum.TENSOR_INITIALIZATION.value: WatchCondition.Condition.tensor_initialization, | |||
| ConditionIdEnum.TENSOR_OVERFLOW.value: WatchCondition.Condition.tensor_general_overflow, | |||
| @@ -95,13 +95,9 @@ class WatchpointOperator: | |||
| def _validate_watch_condition(self, watch_condition): | |||
| """Validate watch condition.""" | |||
| metadata_stream = self._metadata_stream | |||
| if metadata_stream.backend == 'GPU' and watch_condition.get('id') in ( | |||
| ConditionIdEnum.OVERFLOW_ASCEND_CHIP.value, ConditionIdEnum.OPERATOR_OVERFLOW.value): | |||
| if metadata_stream.backend == 'GPU' and watch_condition.get('id') == ConditionIdEnum.OPERATOR_OVERFLOW.value: | |||
| log.error("GPU doesn't support overflow watch condition.") | |||
| raise DebuggerParamValueError("GPU doesn't support overflow watch condition.") | |||
| if metadata_stream.backend == 'Ascend' and watch_condition.get('id') == ConditionIdEnum.NAN.value: | |||
| log.error("Ascend doesn't support nan watch condition.") | |||
| raise DebuggerParamValueError("Ascend doesn't support nan watch condition.") | |||
| def update_watchpoint(self, params): | |||
| """ | |||
| @@ -1 +1 @@ | |||
| {"watch_points": [{"id": 1, "watch_condition": {"id": "max_gt", "params": [{"name": "param", "value": 1.0}], "abbr": "MAX>"}}, {"id": 2, "watch_condition": {"id": "max_lt", "params": [{"name": "param", "value": -1.0}], "abbr": "MAX<"}}, {"id": 3, "watch_condition": {"id": "min_gt", "params": [{"name": "param", "value": 1e+32}], "abbr": "MIN>"}}, {"id": 5, "watch_condition": {"id": "max_min_gt", "params": [{"name": "param", "value": 0}], "abbr": "MAX-MIN>"}}, {"id": 6, "watch_condition": {"id": "max_min_lt", "params": [{"name": "param", "value": 0}], "abbr": "MAX-Min<"}}, {"id": 7, "watch_condition": {"id": "mean_gt", "params": [{"name": "param", "value": 0}], "abbr": "MEAN>"}}, {"id": 8, "watch_condition": {"id": "mean_lt", "params": [{"name": "param", "value": 0}], "abbr": "MEAN<"}}, {"id": 9, "watch_condition": {"id": "inf", "params": [], "abbr": "INF"}}, {"id": 10, "watch_condition": {"id": "overflow", "params": [], "abbr": "OVERFLOW"}}]} | |||
| {"watch_points": [{"id": 1, "watch_condition": {"id": "tensor_too_large", "params": [{"name": "max_gt", "value": 1.0}], "abbr": "TL"}}, {"id": 2, "watch_condition": {"id": "tensor_too_small", "params": [{"name": "max_lt", "value": -1.0}], "abbr": "TS"}}, {"id": 3, "watch_condition": {"id": "tensor_too_large", "params": [{"name": "min_gt", "value": 1e+32}], "abbr": "TL"}}, {"id": 5, "watch_condition": {"id": "tensor_too_large", "params": [{"name": "mean_gt", "value": 0}], "abbr": "TL"}}, {"id": 6, "watch_condition": {"id": "tensor_too_small", "params": [{"name": "mean_lt", "value": 0}], "abbr": "TS"}}]} | |||
| @@ -1 +1 @@ | |||
| {"graph": {"nodes": [{"name": "Default/args0", "full_name": "Default/args0", "type": "Parameter", "input": {}, "output": {"Default/TransData-op99": {"shape": [[32, 1, 32, 32]], "edge_type": "data", "independent_layout": false, "data_type": "DT_TENSOR[DT_FLOAT32]", "slot_mapping": [["0", ""]]}}, "slots": [{"slot": "0", "statistics": {"overall_max": 6.0, "overall_min": 1.0, "overall_avg": 3.5, "overall_count": 6, "overall_nan_count": 0, "overall_neg_inf_count": 0, "overall_pos_inf_count": 0, "overall_zero_count": 0.0, "overall_neg_zero_count": 0.0, "overall_pos_zero_count": 6.0}, "shape": [2, 3], "has_prev_step": true}], "graph_name": "graph_0"}, {"name": "Default/network-WithLossCell/_backbone-LeNet5/conv1-Conv2d/Cast-op190", "full_name": "Default/network-WithLossCell/_backbone-LeNet5/conv1-Conv2d/Cast-op190", "type": "Cast", "input": {"Default/TransData-op99": {"shape": [[32, 1, 32, 32]], "edge_type": "data", "independent_layout": false, "data_type": "DT_TENSOR[DT_FLOAT32]", "slot_mapping": [["0", ""]]}}, "output": {}, "slots": [{"slot": "0", "statistics": {"overall_max": 6.0, "overall_min": 1.0, "overall_avg": 3.5, "overall_count": 6, "overall_nan_count": 0, "overall_neg_inf_count": 0, "overall_pos_inf_count": 0, "overall_zero_count": 0.0, "overall_neg_zero_count": 0.0, "overall_pos_zero_count": 6.0}, "shape": [2, 3]}], "graph_name": "graph_0"}, {"name": "Default/TransData-op99", "full_name": "Default/TransData-op99", "type": "TransData", "input": {"Default/args0": {"shape": [[32, 1, 32, 32]], "edge_type": "data", "independent_layout": false, "data_type": "DT_TENSOR[DT_FLOAT32]", "slot_mapping": [["0", ""]]}}, "output": {"Default/network-WithLossCell/_backbone-LeNet5/conv1-Conv2d/Cast-op190": {"shape": [[32, 1, 32, 32]], "edge_type": "data", "independent_layout": false, "data_type": "DT_TENSOR[DT_FLOAT32]", "slot_mapping": [["0", ""]]}}, "slots": [{"slot": "0", "summarized_error_code": 0, "watch_points": [{"id": 1, "watch_condition": {"id": "inf", "params": [], "abbr": "INF"}, "error_code": 0}], "statistics": {"overall_max": 6.0, "overall_min": 1.0, "overall_avg": 3.5, "overall_count": 6, "overall_nan_count": 0, "overall_neg_inf_count": 0, "overall_pos_inf_count": 0, "overall_zero_count": 0.0, "overall_neg_zero_count": 0.0, "overall_pos_zero_count": 6.0}, "shape": [2, 3]}], "graph_name": "graph_0"}]}} | |||
| {"graph": {"nodes": [{"name": "Default/args0", "full_name": "Default/args0", "type": "Parameter", "input": {}, "output": {"Default/TransData-op99": {"shape": [[32, 1, 32, 32]], "edge_type": "data", "independent_layout": false, "data_type": "DT_TENSOR[DT_FLOAT32]", "slot_mapping": [["0", ""]]}}, "slots": [{"slot": "0", "statistics": {"overall_max": 6.0, "overall_min": 1.0, "overall_avg": 3.5, "overall_count": 6, "overall_nan_count": 0, "overall_neg_inf_count": 0, "overall_pos_inf_count": 0, "overall_zero_count": 0.0, "overall_neg_zero_count": 0.0, "overall_pos_zero_count": 6.0}, "shape": [2, 3], "has_prev_step": true}], "graph_name": "graph_0"}, {"name": "Default/network-WithLossCell/_backbone-LeNet5/conv1-Conv2d/Cast-op190", "full_name": "Default/network-WithLossCell/_backbone-LeNet5/conv1-Conv2d/Cast-op190", "type": "Cast", "input": {"Default/TransData-op99": {"shape": [[32, 1, 32, 32]], "edge_type": "data", "independent_layout": false, "data_type": "DT_TENSOR[DT_FLOAT32]", "slot_mapping": [["0", ""]]}}, "output": {}, "slots": [{"slot": "0", "statistics": {"overall_max": 6.0, "overall_min": 1.0, "overall_avg": 3.5, "overall_count": 6, "overall_nan_count": 0, "overall_neg_inf_count": 0, "overall_pos_inf_count": 0, "overall_zero_count": 0.0, "overall_neg_zero_count": 0.0, "overall_pos_zero_count": 6.0}, "shape": [2, 3]}], "graph_name": "graph_0"}, {"name": "Default/TransData-op99", "full_name": "Default/TransData-op99", "type": "TransData", "input": {"Default/args0": {"shape": [[32, 1, 32, 32]], "edge_type": "data", "independent_layout": false, "data_type": "DT_TENSOR[DT_FLOAT32]", "slot_mapping": [["0", ""]]}}, "output": {"Default/network-WithLossCell/_backbone-LeNet5/conv1-Conv2d/Cast-op190": {"shape": [[32, 1, 32, 32]], "edge_type": "data", "independent_layout": false, "data_type": "DT_TENSOR[DT_FLOAT32]", "slot_mapping": [["0", ""]]}}, "slots": [{"slot": "0", "summarized_error_code": 0, "watch_points": [{"id": 1, "watch_condition": {"id": "tensor_too_large", "params": [{"name": "max_gt", "value": 1.0, "actual_value": null}], "abbr": "TL"}, "error_code": 0}], "statistics": {"overall_max": 6.0, "overall_min": 1.0, "overall_avg": 3.5, "overall_count": 6, "overall_nan_count": 0, "overall_neg_inf_count": 0, "overall_pos_inf_count": 0, "overall_zero_count": 0.0, "overall_neg_zero_count": 0.0, "overall_pos_zero_count": 6.0}, "shape": [2, 3]}], "graph_name": "graph_0"}]}} | |||
| @@ -1,47 +1 @@ | |||
| { | |||
| "watch_point_hits": [ | |||
| { | |||
| "node_name": "Default/TransData-op99", | |||
| "tensors": [ | |||
| { | |||
| "slot": "0", | |||
| "summarized_error_code": 0, | |||
| "watch_points": [ | |||
| { | |||
| "id": 1, | |||
| "watch_condition": { | |||
| "id": "inf", | |||
| "params": [], | |||
| "abbr": "INF" | |||
| }, | |||
| "error_code": 0 | |||
| } | |||
| ] | |||
| } | |||
| ], | |||
| "graph_name": "graph_0" | |||
| }, | |||
| { | |||
| "node_name": "Default/optimizer-Momentum/ApplyMomentum[8]_1/ApplyMomentum-op25", | |||
| "tensors": [ | |||
| { | |||
| "slot": "0", | |||
| "summarized_error_code": 0, | |||
| "watch_points": [ | |||
| { | |||
| "id": 1, | |||
| "watch_condition": { | |||
| "id": "inf", | |||
| "params": [], | |||
| "abbr": "INF" | |||
| }, | |||
| "error_code": 0 | |||
| } | |||
| ] | |||
| } | |||
| ], | |||
| "graph_name": "graph_0" | |||
| } | |||
| ], | |||
| "outdated": false | |||
| } | |||
| {"watch_point_hits": [{"node_name": "Default/TransData-op99", "tensors": [{"slot": "0", "summarized_error_code": 0, "watch_points": [{"id": 1, "watch_condition": {"id": "tensor_too_large", "params": [{"name": "max_gt", "value": 1.0, "actual_value": null}], "abbr": "TL"}, "error_code": 0}]}], "graph_name": "graph_0"}, {"node_name": "Default/optimizer-Momentum/ApplyMomentum[8]_1/ApplyMomentum-op25", "tensors": [{"slot": "0", "summarized_error_code": 0, "watch_points": [{"id": 1, "watch_condition": {"id": "tensor_too_large", "params": [{"name": "max_gt", "value": 1.0, "actual_value": null}], "abbr": "TL"}, "error_code": 0}]}], "graph_name": "graph_0"}], "outdated": false} | |||
| @@ -84,7 +84,7 @@ class TestAscendDebugger: | |||
| def test_get_conditions(self, app_client): | |||
| """Test get conditions for ascend.""" | |||
| url = '/v1/mindinsight/conditionmgr/train-jobs/train-id/conditions' | |||
| url = '/v1/mindinsight/conditionmgr/train-jobs/train-id/condition-collections' | |||
| body_data = {} | |||
| expect_file = 'get_conditions_for_ascend.json' | |||
| with self._debugger_client.get_thread_instance(): | |||
| @@ -131,16 +131,12 @@ class TestAscendDebugger: | |||
| with self._debugger_client.get_thread_instance(): | |||
| check_state(app_client) | |||
| conditions = [ | |||
| {'id': 'max_gt', 'params': [{'name': 'param', 'value': 1.0}]}, | |||
| {'id': 'max_lt', 'params': [{'name': 'param', 'value': -1.0}]}, | |||
| {'id': 'min_gt', 'params': [{'name': 'param', 'value': 1e+32}]}, | |||
| {'id': 'min_lt', 'params': [{'name': 'param', 'value': -1e+32}]}, | |||
| {'id': 'max_min_gt', 'params': [{'name': 'param', 'value': 0}]}, | |||
| {'id': 'max_min_lt', 'params': [{'name': 'param', 'value': 0}]}, | |||
| {'id': 'mean_gt', 'params': [{'name': 'param', 'value': 0}]}, | |||
| {'id': 'mean_lt', 'params': [{'name': 'param', 'value': 0}]}, | |||
| {'id': 'inf', 'params': []}, | |||
| {'id': 'overflow', 'params': []}, | |||
| {'id': 'tensor_too_large', 'params': [{'name': 'max_gt', 'value': 1.0}]}, | |||
| {'id': 'tensor_too_small', 'params': [{'name': 'max_lt', 'value': -1.0}]}, | |||
| {'id': 'tensor_too_large', 'params': [{'name': 'min_gt', 'value': 1e+32}]}, | |||
| {'id': 'tensor_too_small', 'params': [{'name': 'min_lt', 'value': -1e+32}]}, | |||
| {'id': 'tensor_too_large', 'params': [{'name': 'mean_gt', 'value': 0}]}, | |||
| {'id': 'tensor_too_small', 'params': [{'name': 'mean_lt', 'value': 0}]} | |||
| ] | |||
| for idx, condition in enumerate(conditions): | |||
| create_watchpoint(app_client, condition, idx + 1) | |||
| @@ -167,7 +163,7 @@ class TestAscendDebugger: | |||
| leaf_node_name = 'Default/optimizer-Momentum/Parameter[18]_7/moments.fc3.bias' | |||
| with self._debugger_client.get_thread_instance(): | |||
| check_state(app_client) | |||
| condition = {'id': 'inf', 'params': []} | |||
| condition = {'id': 'tensor_too_large', 'params': [{'name': 'max_gt', 'value': 1.0}]} | |||
| create_watchpoint(app_client, condition, watch_point_id) | |||
| # update watchpoint watchpoint list | |||
| url = 'update_watchpoint' | |||
| @@ -327,7 +323,7 @@ class TestAscendDebugger: | |||
| @pytest.mark.platform_x86_ascend_training | |||
| @pytest.mark.parametrize("url, body_data, enable_recheck", [ | |||
| ('create_watchpoint', | |||
| {'condition': {'id': 'inf', 'params': []}, | |||
| {'condition': {'id': 'tensor_too_large', 'params': [{'name': 'max_gt', 'value': 1.0}]}, | |||
| 'watch_nodes': ['Default']}, True), | |||
| ('update_watchpoint', | |||
| {'watch_point_id': 1, 'watch_nodes': ['Default/optimizer-Momentum/Parameter[18]_7'], | |||
| @@ -434,10 +430,10 @@ class TestGPUDebugger: | |||
| @pytest.mark.platform_x86_ascend_training | |||
| @pytest.mark.parametrize("url, body_data, enable_recheck", [ | |||
| ('create_watchpoint', | |||
| {'condition': {'id': 'inf', 'params': []}, | |||
| {'condition': {'id': 'tensor_too_large', 'params': [{'name': 'max_gt', 'value': 1.0}]}, | |||
| 'watch_nodes': ['Default']}, True), | |||
| ('create_watchpoint', | |||
| {'condition': {'id': 'inf', 'params': []}, | |||
| {'condition': {'id': 'tensor_too_large', 'params': [{'name': 'max_gt', 'value': 1.0}]}, | |||
| 'watch_nodes': ['Default/TransData-op99']}, True), | |||
| ('update_watchpoint', | |||
| {'watch_point_id': 1, 'watch_nodes': ['Default/optimizer-Momentum/Parameter[18]_7'], | |||
| @@ -472,7 +468,7 @@ class TestGPUDebugger: | |||
| def test_get_conditions(self, app_client): | |||
| """Test get conditions for gpu.""" | |||
| url = '/v1/mindinsight/conditionmgr/train-jobs/train-id/conditions' | |||
| url = '/v1/mindinsight/conditionmgr/train-jobs/train-id/condition-collections' | |||
| body_data = {} | |||
| expect_file = 'get_conditions_for_gpu.json' | |||
| with self._debugger_client.get_thread_instance(): | |||
| @@ -493,7 +489,7 @@ class TestGPUDebugger: | |||
| # send recheck when disable to do recheck | |||
| get_request_result(app_client, 'recheck', {}, method='post', expect_code=400) | |||
| # send recheck when enable to do recheck | |||
| create_watchpoint(app_client, {'id': 'inf', 'params': []}, 2) | |||
| create_watchpoint(app_client, {'id': 'tensor_too_large', 'params': [{'name': 'max_gt', 'value': 1.0}]}, 2) | |||
| res = get_request_result(app_client, 'recheck', {}, method='post') | |||
| assert res['metadata']['enable_recheck'] is False | |||
| @@ -579,10 +575,10 @@ class TestMultiGraphDebugger: | |||
| @pytest.mark.platform_x86_gpu_training | |||
| @pytest.mark.platform_x86_ascend_training | |||
| @pytest.mark.parametrize("filter_condition, expect_id", [ | |||
| ({'condition': {'id': 'inf'}, | |||
| ({'condition': {'id': 'tensor_too_large', 'params': [{'name': 'max_gt', 'value': 1.0}]}, | |||
| 'watch_nodes': ['Default/optimizer-Momentum/Parameter[18]_7'], | |||
| 'graph_name': 'graph_0'}, 1), | |||
| ({'condition': {'id': 'inf'}, | |||
| ({'condition': {'id': 'tensor_too_large', 'params': [{'name': 'max_gt', 'value': 1.0}]}, | |||
| 'watch_nodes': ['graph_0/Default/optimizer-Momentum/ApplyMomentum[8]_1'], | |||
| 'graph_name': None}, 1) | |||
| ]) | |||
| @@ -665,7 +661,8 @@ def create_watchpoint(app_client, condition, expect_id): | |||
| def create_watchpoint_and_wait(app_client): | |||
| """Preparation for recheck.""" | |||
| check_state(app_client) | |||
| create_watchpoint(app_client, condition={'id': 'inf', 'params': []}, expect_id=1) | |||
| create_watchpoint(app_client, condition={'id': 'tensor_too_large', 'params': [{'name': 'max_gt', 'value': 1.0}]}, | |||
| expect_id=1) | |||
| # send run command to get watchpoint hit | |||
| url = 'control' | |||
| body_data = {'mode': 'continue', | |||
| @@ -74,7 +74,7 @@ def send_and_save_result(app_client, url, body_data, file_path, method='post'): | |||
| def delete_random_items(res): | |||
| """delete the random items in metadata.""" | |||
| if res.get('metadata'): | |||
| if isinstance(res, dict) and res.get('metadata'): | |||
| if res['metadata'].get('ip'): | |||
| res['metadata'].pop('ip') | |||
| if res['metadata'].get('pos'): | |||
| @@ -1,25 +1,5 @@ | |||
| [ | |||
| { | |||
| "watchCondition": { | |||
| "condition": "inf" | |||
| }, | |||
| "id": 1, | |||
| "watch_nodes_num": 0 | |||
| }, | |||
| { | |||
| "watchCondition": { | |||
| "condition": "inf" | |||
| }, | |||
| "id": 2, | |||
| "watch_nodes_num": 172 | |||
| }, | |||
| { | |||
| "watchCondition": { | |||
| "condition": "max_gt", | |||
| "params": [{"name": "param", "value": 1}], | |||
| "value": 1 | |||
| }, | |||
| "id": 3, | |||
| "watch_nodes_num": 1 | |||
| } | |||
| {"watchCondition": {"condition": "tensor_too_small", "value": 1.0, "params": [{"name": "abs_mean_lt", "disabled": true}, {"name": "max_lt", "value": 1.0}, {"name": "min_lt", "disabled": true}, {"name": "mean_lt", "disabled": true}]}, "id": 1, "watch_nodes_num": 0}, | |||
| {"watchCondition": {"condition": "tensor_too_small", "value": 1.0, "params": [{"name": "abs_mean_lt", "disabled": true}, {"name": "max_lt", "disabled": true}, {"name": "min_lt", "value": 1.0}, {"name": "mean_lt", "disabled": true}]}, "id": 2, "watch_nodes_num": 172}, | |||
| {"watchCondition": {"condition": "tensor_too_large", "value": 1.0, "params": [{"name": "abs_mean_gt", "disabled": true}, {"name": "max_gt", "value": 1.0}, {"name": "min_gt", "disabled": true}, {"name": "mean_gt", "disabled": true}]}, "id": 3, "watch_nodes_num": 1} | |||
| ] | |||
| @@ -1 +1 @@ | |||
| [{"id": 1, "watch_condition": {"id": "inf", "params": [], "abbr": "INF"}}, {"id": 2, "watch_condition": {"id": "inf", "params": [], "abbr": "INF"}}, {"id": 3, "watch_condition": {"id": "max_gt", "params": [{"name": "param", "value": 1}], "abbr": "MAX>"}}] | |||
| [{"id": 1, "watch_condition": {"id": "tensor_too_small", "params": [{"name": "max_lt", "value": 1.0}], "abbr": "TS"}}, {"id": 2, "watch_condition": {"id": "tensor_too_small", "params": [{"name": "min_lt", "value": 1.0}], "abbr": "TS"}}, {"id": 3, "watch_condition": {"id": "tensor_too_large", "params": [{"name": "max_gt", "value": 1.0}], "abbr": "TL"}}] | |||
| @@ -61,9 +61,9 @@ class TestWatchpointHandler: | |||
| def _create_watchpoint(self): | |||
| """Test create_watchpoint.""" | |||
| watchpoints = [ | |||
| ({'id': 'inf', 'params': []}, None, None, 1), | |||
| ({'id': 'inf', 'params': []}, ["Default"], None, 2), | |||
| ({'id': 'max_gt', 'params': [{'name': 'param', 'value': 1}]}, | |||
| ({'id': 'tensor_too_small', 'params': [{'name': 'max_lt', 'value': 1.0}]}, None, None, 1), | |||
| ({'id': 'tensor_too_small', 'params': [{'name': 'min_lt', 'value': 1.0}]}, ["Default"], None, 2), | |||
| ({'id': 'tensor_too_large', 'params': [{'name': 'max_gt', 'value': 1.0}]}, | |||
| ["Gradients/Default/network-WithLossCell/_backbone-LeNet5/relu-ReLU/gradReLU/ReluGradV2-op92"], | |||
| None, 3) | |||
| ] | |||
| @@ -160,7 +160,8 @@ class TestWatchpointHandler: | |||
| expect_deleted_ids): | |||
| """Test delete_watchpoint.""" | |||
| for _ in range(watch_point_id): | |||
| self.handler.create_watchpoint(self.conditionmgr, {'id': 'inf', 'param': []}) | |||
| self.handler.create_watchpoint(self.conditionmgr, | |||
| {'id': 'tensor_too_small', 'params': [{'name': 'max_lt', 'value': 1.0}]}) | |||
| with TestCase().assertLogs(logger=log, level='DEBUG') as log_content: | |||
| self.handler.delete_watchpoint(watch_point_id) | |||
| TestCase().assertIn( | |||
| @@ -233,13 +234,13 @@ def test_validate_watch_condition_type_error(): | |||
| def test_validate_watch_condition_params_except(): | |||
| """Test validate_watch_condition_params.""" | |||
| watch_condition = {'id': 'inf', 'params': [{'name': 'param', 'value': 0}]} | |||
| watch_condition = {'id': 'weight_overflow', 'params': [{'name': 'param', 'value': 0}]} | |||
| conditionmgr = ConditionMgr() | |||
| with pytest.raises(DebuggerParamValueError) as err: | |||
| validate_watch_condition_params(conditionmgr, watch_condition) | |||
| assert err.value.error_code == '5054B081' | |||
| watch_condition = {'id': 'max_gt', 'params': [{'name': 'param', 'value': '0'}]} | |||
| watch_condition = {'id': 'tensor_overflow', 'params': [{'name': 'param', 'value': '0'}]} | |||
| with pytest.raises(DebuggerParamValueError) as err: | |||
| validate_watch_condition_params(conditionmgr, watch_condition) | |||
| assert err.value.error_code == '5054B081' | |||
| @@ -199,8 +199,9 @@ class TestDebuggerServer: | |||
| def test_create_watchpoint(self, *args): | |||
| """Test create watchpoint.""" | |||
| args[0].return_value = 1 | |||
| res = self._server.create_watchpoint({'watch_condition': {'id': 'inf'}, | |||
| 'watch_nodes': ['watch_node_name']}) | |||
| res = self._server.create_watchpoint( | |||
| {'watch_condition': {'id': 'tensor_too_large', 'params': [{'name': 'max_gt', 'value': 1.0}]}, | |||
| 'watch_nodes': ['watch_node_name']}) | |||
| assert res == {'id': 1, 'metadata': {'enable_recheck': False, 'state': 'waiting'}} | |||
| @mock.patch.object(MetadataHandler, 'state', 'waiting') | |||