diff --git a/mindinsight/datavisual/data_transform/summary_parser/event_parser.py b/mindinsight/datavisual/data_transform/summary_parser/event_parser.py index 676a5751..d7771525 100644 --- a/mindinsight/datavisual/data_transform/summary_parser/event_parser.py +++ b/mindinsight/datavisual/data_transform/summary_parser/event_parser.py @@ -61,16 +61,25 @@ class EventParser: parse_summary_logger.info("Loading %s.", self.summary_file) result = self._load(summary_file_handler) - warning = '' - if not self._scalar_check: - warning = warning + " the summary file contains no scalar value." - if not self._image_check: - warning = warning + " the summary file contains no image." if result: - parse_summary_logger.info("Writing parsed data into scalar.csv") + warning = '' + scalar_path = FileHandler.join(self._output, "scalar.csv") + image_path = FileHandler.join(self._output, IMAGE) + + if not self._image_check: + warning = warning + " The summary file contains no image." + else: + parse_summary_logger.info("Images are written in %s.", image_path) + + if not self._scalar_check: + warning = warning + " The summary file contains no scalar value." 
+ else: + parse_summary_logger.info("Writing scalar data into %s.", scalar_path) + self._scalar_writer.write() if warning: parse_summary_logger.warning(warning) + parse_summary_logger.info("Finished loading %s.", self.summary_file) def _load(self, file_handler): diff --git a/mindinsight/debugger/conditionmgr/condition.py b/mindinsight/debugger/conditionmgr/condition.py index e75cedc8..ded17fc4 100644 --- a/mindinsight/debugger/conditionmgr/condition.py +++ b/mindinsight/debugger/conditionmgr/condition.py @@ -33,7 +33,6 @@ class ConditionIdEnum(Enum): GRADIENT_EXPLODING = "gradient_exploding" TENSOR_OVERFLOW = "tensor_overflow" OPERATOR_OVERFLOW = "operator_overflow" - TENSOR_INITIALIZATION = "tensor_initialization" TENSOR_TOO_LARGE = "tensor_too_large" TENSOR_TOO_SMALL = "tensor_too_small" TENSOR_ALL_ZERO = "tensor_all_zero" @@ -96,7 +95,7 @@ class ConditionContext: step (int): the type of value. debugger_capability (tuple): whether the param support no assignment. """ - def __init__(self, backend, step=0, debugger_capability=(1, 0)): + def __init__(self, backend, step=0, debugger_capability=(1, 1)): self._backend = backend self._step = step self._debugger_capability = debugger_capability diff --git a/mindinsight/debugger/conditionmgr/condition_list.py b/mindinsight/debugger/conditionmgr/condition_list.py index d64e3d7e..b4c18aff 100644 --- a/mindinsight/debugger/conditionmgr/condition_list.py +++ b/mindinsight/debugger/conditionmgr/condition_list.py @@ -224,34 +224,6 @@ CONDITION_LIST = [ supported_platforms=(PlatformEnum.ASCEND,), minimum_debugger_capability=(1, 1) ), - Condition( - condition_id=ConditionIdEnum.TENSOR_INITIALIZATION, - abbr="TI", - # Send this condition to MindSpore will use WatchCondition.Condition.tensor_initialization - optimize_phase=OptimizePhaseEnum.TENSOR_CHECK, - parameters=[ - ConditionParameter( - name="zero_percentage_ge", - value_type=ValueTypeEnum.FLOAT64, - valid_test_func=check_percentage_param_range, - default_value=100 - ), - 
ConditionParameter( - name="max_gt", - value_type=ValueTypeEnum.FLOAT64, - valid_test_func=check_normal_param_range - ), - ConditionParameter( - name="min_lt", - value_type=ValueTypeEnum.FLOAT64, - valid_test_func=check_normal_param_range - ) - ], - supported_target_type=TargetTypeEnum.TENSOR, - supported_platforms=(PlatformEnum.ASCEND, PlatformEnum.GPU), - minimum_debugger_capability=(1, 1), - availability_test_func=check_initialization_available - ), Condition( condition_id=ConditionIdEnum.TENSOR_TOO_LARGE, abbr="TL", diff --git a/mindinsight/debugger/debugger_server.py b/mindinsight/debugger/debugger_server.py index bd79aefa..e99a1287 100644 --- a/mindinsight/debugger/debugger_server.py +++ b/mindinsight/debugger/debugger_server.py @@ -71,7 +71,7 @@ class DebuggerServer: def get_condition_collections(self, train_id): """Get default condition_collections""" metadata_stream = self.cache_store.get_stream_handler(Streams.METADATA) - condition_context = ConditionContext(metadata_stream.backend, metadata_stream.step, (1, 1)) + condition_context = ConditionContext(metadata_stream.backend, metadata_stream.step) log.debug("Train_id: %s, backend: %s", train_id, condition_context.backend) return self.condition_mgr.get_all_collections(condition_context) @@ -81,7 +81,7 @@ class DebuggerServer: log.error("Bool param should be given for set_recommended") raise DebuggerParamValueError("Bool param should be given.") metadata_stream = self.cache_store.get_stream_handler(Streams.METADATA) - condition_context = ConditionContext(metadata_stream.backend, metadata_stream.step, (1, 1)) + condition_context = ConditionContext(metadata_stream.backend, metadata_stream.step) log.debug("Train_id: %s, backend: %s", train_id, condition_context.backend) res = metadata_stream.get(['state', 'enable_recheck']) if set_recommended and not metadata_stream.recommendation_confirmed: diff --git a/mindinsight/debugger/proto/debug_grpc.proto b/mindinsight/debugger/proto/debug_grpc.proto index 
d3283e5e..a364391f 100644 --- a/mindinsight/debugger/proto/debug_grpc.proto +++ b/mindinsight/debugger/proto/debug_grpc.proto @@ -91,6 +91,8 @@ message ViewCMD { message WatchCondition { enum Condition { + // nan won't be used anymore, but the first enum value must be zero in proto3, so we keep this Enum member. + nan = 0; overflow = 2; sd_gt = 11; sd_lt = 12; diff --git a/mindinsight/debugger/proto/debug_grpc_pb2.py b/mindinsight/debugger/proto/debug_grpc_pb2.py index 4f0d4e64..c15ae513 100644 --- a/mindinsight/debugger/proto/debug_grpc_pb2.py +++ b/mindinsight/debugger/proto/debug_grpc_pb2.py @@ -21,7 +21,7 @@ DESCRIPTOR = _descriptor.FileDescriptor( package='debugger', syntax='proto3', serialized_options=None, - serialized_pb=_b('\n+mindinsight/debugger/proto/debug_grpc.proto\x12\x08\x64\x65\x62ugger\x1a)mindinsight/debugger/proto/ms_graph.proto\"\x92\x01\n\x08Metadata\x12\x13\n\x0b\x64\x65vice_name\x18\x01 \x01(\t\x12\x10\n\x08\x63ur_step\x18\x02 \x01(\x05\x12\x0f\n\x07\x62\x61\x63kend\x18\x03 \x01(\t\x12\x10\n\x08\x63ur_node\x18\x04 \x01(\t\x12\x15\n\rtraining_done\x18\x05 \x01(\x08\x12\x11\n\tgraph_num\x18\x06 \x01(\x05\x12\x12\n\nms_version\x18\x07 \x01(\t\")\n\x05\x43hunk\x12\x0e\n\x06\x62uffer\x18\x01 \x01(\x0c\x12\x10\n\x08\x66inished\x18\x02 \x01(\x08\"\x87\x02\n\nEventReply\x12+\n\x06status\x18\x01 \x01(\x0e\x32\x1b.debugger.EventReply.Status\x12\x0e\n\x04\x65xit\x18\x02 \x01(\x08H\x00\x12#\n\x07run_cmd\x18\x03 \x01(\x0b\x32\x10.debugger.RunCMDH\x00\x12#\n\x07set_cmd\x18\x04 \x01(\x0b\x32\x10.debugger.SetCMDH\x00\x12%\n\x08view_cmd\x18\x05 \x01(\x0b\x32\x11.debugger.ViewCMDH\x00\x12\x19\n\x0fversion_matched\x18\x06 \x01(\x08H\x00\")\n\x06Status\x12\x06\n\x02OK\x10\x00\x12\n\n\x06\x46\x41ILED\x10\x01\x12\x0b\n\x07PENDING\x10\x02\x42\x05\n\x03\x63md\"L\n\x06RunCMD\x12\x11\n\trun_level\x18\x01 \x01(\t\x12\x13\n\trun_steps\x18\x02 \x01(\x05H\x00\x12\x13\n\tnode_name\x18\x03 
\x01(\tH\x00\x42\x05\n\x03\x63md\"\x81\x01\n\x06SetCMD\x12(\n\x0bwatch_nodes\x18\x01 \x03(\x0b\x32\x13.debugger.WatchNode\x12\x31\n\x0fwatch_condition\x18\x02 \x01(\x0b\x32\x18.debugger.WatchCondition\x12\x0e\n\x06\x64\x65lete\x18\x03 \x01(\x08\x12\n\n\x02id\x18\x04 \x01(\x05\"1\n\x07ViewCMD\x12&\n\x07tensors\x18\x01 \x03(\x0b\x32\x15.debugger.TensorProto\"\xf4\x04\n\x0eWatchCondition\x12\x35\n\tcondition\x18\x01 \x01(\x0e\x32\".debugger.WatchCondition.Condition\x12\r\n\x05value\x18\x02 \x01(\x02\x12\x32\n\x06params\x18\x04 \x03(\x0b\x32\".debugger.WatchCondition.Parameter\x1a]\n\tParameter\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x10\n\x08\x64isabled\x18\x02 \x01(\x08\x12\r\n\x05value\x18\x03 \x01(\x01\x12\x0b\n\x03hit\x18\x04 \x01(\x08\x12\x14\n\x0c\x61\x63tual_value\x18\x05 \x01(\x01\"\x88\x03\n\tCondition\x12\x07\n\x03nan\x10\x00\x12\x07\n\x03inf\x10\x01\x12\x0c\n\x08overflow\x10\x02\x12\n\n\x06max_gt\x10\x03\x12\n\n\x06max_lt\x10\x04\x12\n\n\x06min_gt\x10\x05\x12\n\n\x06min_lt\x10\x06\x12\x0e\n\nmax_min_gt\x10\x07\x12\x0e\n\nmax_min_lt\x10\x08\x12\x0b\n\x07mean_gt\x10\t\x12\x0b\n\x07mean_lt\x10\n\x12\t\n\x05sd_gt\x10\x0b\x12\t\n\x05sd_lt\x10\x0c\x12\x1b\n\x17tensor_general_overflow\x10\r\x12\x19\n\x15tensor_initialization\x10\x0e\x12\x14\n\x10tensor_too_large\x10\x0f\x12\x14\n\x10tensor_too_small\x10\x10\x12\x13\n\x0ftensor_all_zero\x10\x11\x12\x1b\n\x17tensor_change_too_large\x10\x12\x12\x1b\n\x17tensor_change_too_small\x10\x13\x12\x16\n\x12tensor_not_changed\x10\x14\x12\x10\n\x0ctensor_range\x10\x15\"1\n\tWatchNode\x12\x11\n\tnode_name\x18\x01 \x01(\t\x12\x11\n\tnode_type\x18\x02 \x01(\t\"\x89\x01\n\rWatchpointHit\x12%\n\x06tensor\x18\x01 \x01(\x0b\x32\x15.debugger.TensorProto\x12\x31\n\x0fwatch_condition\x18\x02 \x01(\x0b\x32\x18.debugger.WatchCondition\x12\n\n\x02id\x18\x03 \x01(\x05\x12\x12\n\nerror_code\x18\x04 
\x01(\x05\x32\x81\x03\n\rEventListener\x12\x35\n\x07WaitCMD\x12\x12.debugger.Metadata\x1a\x14.debugger.EventReply\"\x00\x12:\n\x0cSendMetadata\x12\x12.debugger.Metadata\x1a\x14.debugger.EventReply\"\x00\x12\x36\n\tSendGraph\x12\x0f.debugger.Chunk\x1a\x14.debugger.EventReply\"\x00(\x01\x12>\n\x0bSendTensors\x12\x15.debugger.TensorProto\x1a\x14.debugger.EventReply\"\x00(\x01\x12G\n\x12SendWatchpointHits\x12\x17.debugger.WatchpointHit\x1a\x14.debugger.EventReply\"\x00(\x01\x12<\n\x0fSendMultiGraphs\x12\x0f.debugger.Chunk\x1a\x14.debugger.EventReply\"\x00(\x01\x62\x06proto3') + serialized_pb=_b('\n+mindinsight/debugger/proto/debug_grpc.proto\x12\x08\x64\x65\x62ugger\x1a)mindinsight/debugger/proto/ms_graph.proto\"\x92\x01\n\x08Metadata\x12\x13\n\x0b\x64\x65vice_name\x18\x01 \x01(\t\x12\x10\n\x08\x63ur_step\x18\x02 \x01(\x05\x12\x0f\n\x07\x62\x61\x63kend\x18\x03 \x01(\t\x12\x10\n\x08\x63ur_node\x18\x04 \x01(\t\x12\x15\n\rtraining_done\x18\x05 \x01(\x08\x12\x11\n\tgraph_num\x18\x06 \x01(\x05\x12\x12\n\nms_version\x18\x07 \x01(\t\")\n\x05\x43hunk\x12\x0e\n\x06\x62uffer\x18\x01 \x01(\x0c\x12\x10\n\x08\x66inished\x18\x02 \x01(\x08\"\x87\x02\n\nEventReply\x12+\n\x06status\x18\x01 \x01(\x0e\x32\x1b.debugger.EventReply.Status\x12\x0e\n\x04\x65xit\x18\x02 \x01(\x08H\x00\x12#\n\x07run_cmd\x18\x03 \x01(\x0b\x32\x10.debugger.RunCMDH\x00\x12#\n\x07set_cmd\x18\x04 \x01(\x0b\x32\x10.debugger.SetCMDH\x00\x12%\n\x08view_cmd\x18\x05 \x01(\x0b\x32\x11.debugger.ViewCMDH\x00\x12\x19\n\x0fversion_matched\x18\x06 \x01(\x08H\x00\")\n\x06Status\x12\x06\n\x02OK\x10\x00\x12\n\n\x06\x46\x41ILED\x10\x01\x12\x0b\n\x07PENDING\x10\x02\x42\x05\n\x03\x63md\"L\n\x06RunCMD\x12\x11\n\trun_level\x18\x01 \x01(\t\x12\x13\n\trun_steps\x18\x02 \x01(\x05H\x00\x12\x13\n\tnode_name\x18\x03 \x01(\tH\x00\x42\x05\n\x03\x63md\"\x81\x01\n\x06SetCMD\x12(\n\x0bwatch_nodes\x18\x01 \x03(\x0b\x32\x13.debugger.WatchNode\x12\x31\n\x0fwatch_condition\x18\x02 
\x01(\x0b\x32\x18.debugger.WatchCondition\x12\x0e\n\x06\x64\x65lete\x18\x03 \x01(\x08\x12\n\n\x02id\x18\x04 \x01(\x05\"1\n\x07ViewCMD\x12&\n\x07tensors\x18\x01 \x03(\x0b\x32\x15.debugger.TensorProto\"\x81\x04\n\x0eWatchCondition\x12\x35\n\tcondition\x18\x01 \x01(\x0e\x32\".debugger.WatchCondition.Condition\x12\r\n\x05value\x18\x02 \x01(\x02\x12\x32\n\x06params\x18\x04 \x03(\x0b\x32\".debugger.WatchCondition.Parameter\x1a]\n\tParameter\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x10\n\x08\x64isabled\x18\x02 \x01(\x08\x12\r\n\x05value\x18\x03 \x01(\x01\x12\x0b\n\x03hit\x18\x04 \x01(\x08\x12\x14\n\x0c\x61\x63tual_value\x18\x05 \x01(\x01\"\x95\x02\n\tCondition\x12\x07\n\x03nan\x10\x00\x12\x0c\n\x08overflow\x10\x02\x12\t\n\x05sd_gt\x10\x0b\x12\t\n\x05sd_lt\x10\x0c\x12\x1b\n\x17tensor_general_overflow\x10\r\x12\x19\n\x15tensor_initialization\x10\x0e\x12\x14\n\x10tensor_too_large\x10\x0f\x12\x14\n\x10tensor_too_small\x10\x10\x12\x13\n\x0ftensor_all_zero\x10\x11\x12\x1b\n\x17tensor_change_too_large\x10\x12\x12\x1b\n\x17tensor_change_too_small\x10\x13\x12\x16\n\x12tensor_not_changed\x10\x14\x12\x10\n\x0ctensor_range\x10\x15\"1\n\tWatchNode\x12\x11\n\tnode_name\x18\x01 \x01(\t\x12\x11\n\tnode_type\x18\x02 \x01(\t\"\x89\x01\n\rWatchpointHit\x12%\n\x06tensor\x18\x01 \x01(\x0b\x32\x15.debugger.TensorProto\x12\x31\n\x0fwatch_condition\x18\x02 \x01(\x0b\x32\x18.debugger.WatchCondition\x12\n\n\x02id\x18\x03 \x01(\x05\x12\x12\n\nerror_code\x18\x04 
\x01(\x05\x32\x81\x03\n\rEventListener\x12\x35\n\x07WaitCMD\x12\x12.debugger.Metadata\x1a\x14.debugger.EventReply\"\x00\x12:\n\x0cSendMetadata\x12\x12.debugger.Metadata\x1a\x14.debugger.EventReply\"\x00\x12\x36\n\tSendGraph\x12\x0f.debugger.Chunk\x1a\x14.debugger.EventReply\"\x00(\x01\x12>\n\x0bSendTensors\x12\x15.debugger.TensorProto\x1a\x14.debugger.EventReply\"\x00(\x01\x12G\n\x12SendWatchpointHits\x12\x17.debugger.WatchpointHit\x1a\x14.debugger.EventReply\"\x00(\x01\x12<\n\x0fSendMultiGraphs\x12\x0f.debugger.Chunk\x1a\x14.debugger.EventReply\"\x00(\x01\x62\x06proto3') , dependencies=[mindinsight_dot_debugger_dot_proto_dot_ms__graph__pb2.DESCRIPTOR,]) @@ -64,94 +64,58 @@ _WATCHCONDITION_CONDITION = _descriptor.EnumDescriptor( serialized_options=None, type=None), _descriptor.EnumValueDescriptor( - name='inf', index=1, number=1, + name='overflow', index=1, number=2, serialized_options=None, type=None), _descriptor.EnumValueDescriptor( - name='overflow', index=2, number=2, + name='sd_gt', index=2, number=11, serialized_options=None, type=None), _descriptor.EnumValueDescriptor( - name='max_gt', index=3, number=3, + name='sd_lt', index=3, number=12, serialized_options=None, type=None), _descriptor.EnumValueDescriptor( - name='max_lt', index=4, number=4, + name='tensor_general_overflow', index=4, number=13, serialized_options=None, type=None), _descriptor.EnumValueDescriptor( - name='min_gt', index=5, number=5, + name='tensor_initialization', index=5, number=14, serialized_options=None, type=None), _descriptor.EnumValueDescriptor( - name='min_lt', index=6, number=6, + name='tensor_too_large', index=6, number=15, serialized_options=None, type=None), _descriptor.EnumValueDescriptor( - name='max_min_gt', index=7, number=7, + name='tensor_too_small', index=7, number=16, serialized_options=None, type=None), _descriptor.EnumValueDescriptor( - name='max_min_lt', index=8, number=8, + name='tensor_all_zero', index=8, number=17, serialized_options=None, type=None), 
_descriptor.EnumValueDescriptor( - name='mean_gt', index=9, number=9, + name='tensor_change_too_large', index=9, number=18, serialized_options=None, type=None), _descriptor.EnumValueDescriptor( - name='mean_lt', index=10, number=10, + name='tensor_change_too_small', index=10, number=19, serialized_options=None, type=None), _descriptor.EnumValueDescriptor( - name='sd_gt', index=11, number=11, + name='tensor_not_changed', index=11, number=20, serialized_options=None, type=None), _descriptor.EnumValueDescriptor( - name='sd_lt', index=12, number=12, - serialized_options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='tensor_general_overflow', index=13, number=13, - serialized_options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='tensor_initialization', index=14, number=14, - serialized_options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='tensor_too_large', index=15, number=15, - serialized_options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='tensor_too_small', index=16, number=16, - serialized_options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='tensor_all_zero', index=17, number=17, - serialized_options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='tensor_change_too_large', index=18, number=18, - serialized_options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='tensor_change_too_small', index=19, number=19, - serialized_options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='tensor_not_changed', index=20, number=20, - serialized_options=None, - type=None), - _descriptor.EnumValueDescriptor( - name='tensor_range', index=21, number=21, + name='tensor_range', index=12, number=21, serialized_options=None, type=None), ], containing_type=None, serialized_options=None, serialized_start=1056, - serialized_end=1448, + serialized_end=1333, ) _sym_db.RegisterEnumDescriptor(_WATCHCONDITION_CONDITION) @@ -568,7 +532,7 @@ _WATCHCONDITION = 
_descriptor.Descriptor( oneofs=[ ], serialized_start=820, - serialized_end=1448, + serialized_end=1333, ) @@ -605,8 +569,8 @@ _WATCHNODE = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=1450, - serialized_end=1499, + serialized_start=1335, + serialized_end=1384, ) @@ -657,8 +621,8 @@ _WATCHPOINTHIT = _descriptor.Descriptor( extension_ranges=[], oneofs=[ ], - serialized_start=1502, - serialized_end=1639, + serialized_start=1387, + serialized_end=1524, ) _EVENTREPLY.fields_by_name['status'].enum_type = _EVENTREPLY_STATUS @@ -786,8 +750,8 @@ _EVENTLISTENER = _descriptor.ServiceDescriptor( file=DESCRIPTOR, index=0, serialized_options=None, - serialized_start=1642, - serialized_end=2027, + serialized_start=1527, + serialized_end=1912, methods=[ _descriptor.MethodDescriptor( name='WaitCMD', diff --git a/mindinsight/debugger/stream_cache/watchpoint.py b/mindinsight/debugger/stream_cache/watchpoint.py index c4c02a92..3163d44a 100644 --- a/mindinsight/debugger/stream_cache/watchpoint.py +++ b/mindinsight/debugger/stream_cache/watchpoint.py @@ -31,7 +31,6 @@ WATCHPOINT_CONDITION_MAPPING = { ConditionIdEnum.GRADIENT_VANISHING.value: WatchCondition.Condition.tensor_too_small, ConditionIdEnum.OPERATOR_OVERFLOW.value: WatchCondition.Condition.overflow, ConditionIdEnum.TENSOR_ALL_ZERO.value: WatchCondition.Condition.tensor_all_zero, - ConditionIdEnum.TENSOR_INITIALIZATION.value: WatchCondition.Condition.tensor_initialization, ConditionIdEnum.TENSOR_OVERFLOW.value: WatchCondition.Condition.tensor_general_overflow, ConditionIdEnum.TENSOR_RANGE.value: WatchCondition.Condition.tensor_range, ConditionIdEnum.TENSOR_TOO_LARGE.value: WatchCondition.Condition.tensor_too_large, diff --git a/mindinsight/debugger/stream_handler/watchpoint_handler.py b/mindinsight/debugger/stream_handler/watchpoint_handler.py index bf9c27bc..4ff140ea 100644 --- a/mindinsight/debugger/stream_handler/watchpoint_handler.py +++ 
def _get_error_list(error_code):
    """
    Decode a bit-flag error code into the names of the errors it carries.

    Each name is tied to one bit of ``error_code``, counted from the least
    significant bit: bit 0 -> "nan", bit 1 -> "inf", bit 2 -> "no_prev_tensor".
    Bits above these three are ignored.

    Args:
        error_code (int): the code of errors.

    Returns:
        list, the names whose bit is set, in bit order (lowest bit first).
    """
    # The position of a name in this sequence is the bit that signals it.
    flag_names = ("nan", "inf", "no_prev_tensor")
    return [
        flag_name
        for bit_position, flag_name in enumerate(flag_names)
        if (error_code >> bit_position) & 1 == 1
    ]
b/tests/st/func/debugger/expect_results/restful_results/retrieve_tensor_graph-0.json index 59c5fa63..326a4d04 100644 --- a/tests/st/func/debugger/expect_results/restful_results/retrieve_tensor_graph-0.json +++ b/tests/st/func/debugger/expect_results/restful_results/retrieve_tensor_graph-0.json @@ -1 +1 @@ -{"graph": {"nodes": [{"name": "Default/args0", "full_name": "Default/args0", "type": "Parameter", "input": {}, "output": {"Default/TransData-op99": {"shape": [[32, 1, 32, 32]], "edge_type": "data", "independent_layout": false, "data_type": "DT_TENSOR[DT_FLOAT32]", "slot_mapping": [["0", ""]]}}, "slots": [{"slot": "0", "statistics": {"overall_max": 6.0, "overall_min": 1.0, "overall_avg": 3.5, "overall_count": 6, "overall_nan_count": 0, "overall_neg_inf_count": 0, "overall_pos_inf_count": 0, "overall_zero_count": 0.0, "overall_neg_zero_count": 0.0, "overall_pos_zero_count": 6.0}, "shape": [2, 3], "has_prev_step": true}], "graph_name": "graph_0"}, {"name": "Default/network-WithLossCell/_backbone-LeNet5/conv1-Conv2d/Cast-op190", "full_name": "Default/network-WithLossCell/_backbone-LeNet5/conv1-Conv2d/Cast-op190", "type": "Cast", "input": {"Default/TransData-op99": {"shape": [[32, 1, 32, 32]], "edge_type": "data", "independent_layout": false, "data_type": "DT_TENSOR[DT_FLOAT32]", "slot_mapping": [["0", ""]]}}, "output": {}, "slots": [{"slot": "0", "statistics": {"overall_max": 6.0, "overall_min": 1.0, "overall_avg": 3.5, "overall_count": 6, "overall_nan_count": 0, "overall_neg_inf_count": 0, "overall_pos_inf_count": 0, "overall_zero_count": 0.0, "overall_neg_zero_count": 0.0, "overall_pos_zero_count": 6.0}, "shape": [2, 3]}], "graph_name": "graph_0"}, {"name": "Default/TransData-op99", "full_name": "Default/TransData-op99", "type": "TransData", "input": {"Default/args0": {"shape": [[32, 1, 32, 32]], "edge_type": "data", "independent_layout": false, "data_type": "DT_TENSOR[DT_FLOAT32]", "slot_mapping": [["0", ""]]}}, "output": 
{"Default/network-WithLossCell/_backbone-LeNet5/conv1-Conv2d/Cast-op190": {"shape": [[32, 1, 32, 32]], "edge_type": "data", "independent_layout": false, "data_type": "DT_TENSOR[DT_FLOAT32]", "slot_mapping": [["0", ""]]}}, "slots": [{"slot": "0", "summarized_error_code": 0, "watch_points": [{"id": 1, "watch_condition": {"id": "tensor_too_large", "params": [{"name": "max_gt", "value": 1.0, "actual_value": null}], "abbr": "TL"}, "error_code": 0}], "statistics": {"overall_max": 6.0, "overall_min": 1.0, "overall_avg": 3.5, "overall_count": 6, "overall_nan_count": 0, "overall_neg_inf_count": 0, "overall_pos_inf_count": 0, "overall_zero_count": 0.0, "overall_neg_zero_count": 0.0, "overall_pos_zero_count": 6.0}, "shape": [2, 3]}], "graph_name": "graph_0"}]}} \ No newline at end of file +{"graph": {"nodes": [{"name": "Default/args0", "full_name": "Default/args0", "type": "Parameter", "input": {}, "output": {"Default/TransData-op99": {"shape": [[32, 1, 32, 32]], "edge_type": "data", "independent_layout": false, "data_type": "DT_TENSOR[DT_FLOAT32]", "slot_mapping": [["0", ""]]}}, "slots": [{"slot": "0", "statistics": {"overall_max": 6.0, "overall_min": 1.0, "overall_avg": 3.5, "overall_count": 6, "overall_nan_count": 0, "overall_neg_inf_count": 0, "overall_pos_inf_count": 0, "overall_zero_count": 0.0, "overall_neg_zero_count": 0.0, "overall_pos_zero_count": 6.0}, "shape": [2, 3], "has_prev_step": true}], "graph_name": "graph_0"}, {"name": "Default/network-WithLossCell/_backbone-LeNet5/conv1-Conv2d/Cast-op190", "full_name": "Default/network-WithLossCell/_backbone-LeNet5/conv1-Conv2d/Cast-op190", "type": "Cast", "input": {"Default/TransData-op99": {"shape": [[32, 1, 32, 32]], "edge_type": "data", "independent_layout": false, "data_type": "DT_TENSOR[DT_FLOAT32]", "slot_mapping": [["0", ""]]}}, "output": {}, "slots": [{"slot": "0", "statistics": {"overall_max": 6.0, "overall_min": 1.0, "overall_avg": 3.5, "overall_count": 6, "overall_nan_count": 0, "overall_neg_inf_count": 0, 
"overall_pos_inf_count": 0, "overall_zero_count": 0.0, "overall_neg_zero_count": 0.0, "overall_pos_zero_count": 6.0}, "shape": [2, 3]}], "graph_name": "graph_0"}, {"name": "Default/TransData-op99", "full_name": "Default/TransData-op99", "type": "TransData", "input": {"Default/args0": {"shape": [[32, 1, 32, 32]], "edge_type": "data", "independent_layout": false, "data_type": "DT_TENSOR[DT_FLOAT32]", "slot_mapping": [["0", ""]]}}, "output": {"Default/network-WithLossCell/_backbone-LeNet5/conv1-Conv2d/Cast-op190": {"shape": [[32, 1, 32, 32]], "edge_type": "data", "independent_layout": false, "data_type": "DT_TENSOR[DT_FLOAT32]", "slot_mapping": [["0", ""]]}}, "slots": [{"slot": "0", "watch_points": [{"id": 1, "watch_condition": {"id": "tensor_too_large", "params": [{"name": "max_gt", "value": 1.0, "actual_value": null}], "abbr": "TL"}, "error_code": 0, "error_list": []}], "statistics": {"overall_max": 6.0, "overall_min": 1.0, "overall_avg": 3.5, "overall_count": 6, "overall_nan_count": 0, "overall_neg_inf_count": 0, "overall_pos_inf_count": 0, "overall_zero_count": 0.0, "overall_neg_zero_count": 0.0, "overall_pos_zero_count": 6.0}, "shape": [2, 3]}], "graph_name": "graph_0"}]}} \ No newline at end of file diff --git a/tests/st/func/debugger/expect_results/restful_results/retrieve_watchpoint_hit.json b/tests/st/func/debugger/expect_results/restful_results/retrieve_watchpoint_hit.json index dd3e0c34..ddd76840 100644 --- a/tests/st/func/debugger/expect_results/restful_results/retrieve_watchpoint_hit.json +++ b/tests/st/func/debugger/expect_results/restful_results/retrieve_watchpoint_hit.json @@ -1 +1 @@ -{"watch_point_hits": [{"node_name": "Default/TransData-op99", "tensors": [{"slot": "0", "summarized_error_code": 0, "watch_points": [{"id": 1, "watch_condition": {"id": "tensor_too_large", "params": [{"name": "max_gt", "value": 1.0, "actual_value": null}], "abbr": "TL"}, "error_code": 0}]}], "graph_name": "graph_0"}, {"node_name": 
"Default/optimizer-Momentum/ApplyMomentum[8]_1/ApplyMomentum-op25", "tensors": [{"slot": "0", "summarized_error_code": 0, "watch_points": [{"id": 1, "watch_condition": {"id": "tensor_too_large", "params": [{"name": "max_gt", "value": 1.0, "actual_value": null}], "abbr": "TL"}, "error_code": 0}]}], "graph_name": "graph_0"}], "outdated": false} \ No newline at end of file +{"watch_point_hits": [{"node_name": "Default/TransData-op99", "tensors": [{"slot": "0", "watch_points": [{"id": 1, "watch_condition": {"id": "tensor_too_large", "params": [{"name": "max_gt", "value": 1.0, "actual_value": null}], "abbr": "TL"}, "error_code": 0, "error_list": []}]}], "graph_name": "graph_0"}, {"node_name": "Default/optimizer-Momentum/ApplyMomentum[8]_1/ApplyMomentum-op25", "tensors": [{"slot": "0", "watch_points": [{"id": 1, "watch_condition": {"id": "tensor_too_large", "params": [{"name": "max_gt", "value": 1.0, "actual_value": null}], "abbr": "TL"}, "error_code": 0, "error_list": []}]}], "graph_name": "graph_0"}], "outdated": false} \ No newline at end of file diff --git a/tests/ut/debugger/expected_results/watchpoint/watchpoint_hit_handler_get_0.json b/tests/ut/debugger/expected_results/watchpoint/watchpoint_hit_handler_get_0.json index a4cd9268..3d1111fc 100644 --- a/tests/ut/debugger/expected_results/watchpoint/watchpoint_hit_handler_get_0.json +++ b/tests/ut/debugger/expected_results/watchpoint/watchpoint_hit_handler_get_0.json @@ -5,11 +5,11 @@ "tensors": [ { "slot": "0", - "summarized_error_code": 0, "watch_points": [ { "id": 1, "error_code": 0, + "error_list": [], "watch_condition": { "condition": "MAX_GT", "param": 1 diff --git a/tests/ut/debugger/test_debugger_server.py b/tests/ut/debugger/test_debugger_server.py index e1963cf3..4e9e67c3 100644 --- a/tests/ut/debugger/test_debugger_server.py +++ b/tests/ut/debugger/test_debugger_server.py @@ -193,6 +193,7 @@ class TestDebuggerServer: self._server.create_watchpoint({'watch_condition': {'id': 'inf'}}) 
@mock.patch.object(MetadataHandler, 'state', 'waiting') + @mock.patch.object(MetadataHandler, 'backend', 'GPU') @mock.patch.object(GraphHandler, 'get_node_basic_info', return_value=MagicMock()) @mock.patch.object(GraphHandler, 'get_node_type', return_value='aggregation_scope') @mock.patch.object(WatchpointHandler, 'create_watchpoint')