From: @jiang-shuqiang
Reviewed-by:
Signed-off-by:
tags/v1.1.0
| @@ -61,16 +61,25 @@ class EventParser: | |||
| parse_summary_logger.info("Loading %s.", self.summary_file) | |||
| result = self._load(summary_file_handler) | |||
| warning = '' | |||
| if not self._scalar_check: | |||
| warning = warning + " the summary file contains no scalar value." | |||
| if not self._image_check: | |||
| warning = warning + " the summary file contains no image." | |||
| if result: | |||
| parse_summary_logger.info("Writing parsed data into scalar.csv") | |||
| warning = '' | |||
| scalar_path = FileHandler.join(self._output, "scalar.csv") | |||
| image_path = FileHandler.join(self._output, IMAGE) | |||
| if not self._image_check: | |||
| warning = warning + " The summary file contains no image." | |||
| else: | |||
| parse_summary_logger.info("Images are written in %s.", image_path) | |||
| if not self._scalar_check: | |||
| warning = warning + " The summary file contains no scalar value." | |||
| else: | |||
| parse_summary_logger.info("Writing scalar data into %s.", scalar_path) | |||
| self._scalar_writer.write() | |||
| if warning: | |||
| parse_summary_logger.warning(warning) | |||
| parse_summary_logger.info("Finished loading %s.", self.summary_file) | |||
| def _load(self, file_handler): | |||
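Taken on its own, the reordered logic above defers the warnings until both checks have run and only writes scalar.csv when scalar data is actually present, while reporting where images were written. A minimal, self-contained sketch of that control flow, with `os.path.join` standing in for `FileHandler.join`, the literal `"image"` for the IMAGE constant, and a hypothetical `report_parse_result` helper in place of the real method:

```python
import logging
import os

parse_summary_logger = logging.getLogger("parse_summary")


def report_parse_result(output_dir, scalar_check, image_check, result=True):
    """Mimic of the revised warning/logging flow above (illustration only)."""
    warning = ''
    if result:
        scalar_path = os.path.join(output_dir, "scalar.csv")
        image_path = os.path.join(output_dir, "image")
        if not image_check:
            warning += " The summary file contains no image."
        else:
            parse_summary_logger.info("Images are written in %s.", image_path)
        if not scalar_check:
            warning += " The summary file contains no scalar value."
        else:
            parse_summary_logger.info("Writing scalar data into %s.", scalar_path)
            # The real method also calls self._scalar_writer.write() at this point.
    if warning:
        parse_summary_logger.warning(warning)
```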
| @@ -33,7 +33,6 @@ class ConditionIdEnum(Enum): | |||
| GRADIENT_EXPLODING = "gradient_exploding" | |||
| TENSOR_OVERFLOW = "tensor_overflow" | |||
| OPERATOR_OVERFLOW = "operator_overflow" | |||
| TENSOR_INITIALIZATION = "tensor_initialization" | |||
| TENSOR_TOO_LARGE = "tensor_too_large" | |||
| TENSOR_TOO_SMALL = "tensor_too_small" | |||
| TENSOR_ALL_ZERO = "tensor_all_zero" | |||
| @@ -96,7 +95,7 @@ class ConditionContext: | |||
| step (int): the current training step. | |||
| debugger_capability (tuple): the (major, minor) debugger capability version, checked against each condition's minimum_debugger_capability. | |||
| """ | |||
| def __init__(self, backend, step=0, debugger_capability=(1, 0)): | |||
| def __init__(self, backend, step=0, debugger_capability=(1, 1)): | |||
| self._backend = backend | |||
| self._step = step | |||
| self._debugger_capability = debugger_capability | |||
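With the default bumped to `(1, 1)`, callers such as `DebuggerServer.get_condition_collections` further below can drop the explicit capability argument. A minimal stand-in class (not the real `ConditionContext`) showing the effect of the new default:

```python
class ConditionContextSketch:
    """Simplified stand-in for ConditionContext (illustration only)."""

    def __init__(self, backend, step=0, debugger_capability=(1, 1)):
        self.backend = backend
        self.step = step
        self.debugger_capability = debugger_capability


# Relying on the new default is now equivalent to passing (1, 1) explicitly.
ctx = ConditionContextSketch(backend="GPU", step=0)
assert ctx.debugger_capability == (1, 1)
```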
| @@ -224,34 +224,6 @@ CONDITION_LIST = [ | |||
| supported_platforms=(PlatformEnum.ASCEND,), | |||
| minimum_debugger_capability=(1, 1) | |||
| ), | |||
| Condition( | |||
| condition_id=ConditionIdEnum.TENSOR_INITIALIZATION, | |||
| abbr="TI", | |||
| # Sending this condition to MindSpore will use WatchCondition.Condition.tensor_initialization | |||
| optimize_phase=OptimizePhaseEnum.TENSOR_CHECK, | |||
| parameters=[ | |||
| ConditionParameter( | |||
| name="zero_percentage_ge", | |||
| value_type=ValueTypeEnum.FLOAT64, | |||
| valid_test_func=check_percentage_param_range, | |||
| default_value=100 | |||
| ), | |||
| ConditionParameter( | |||
| name="max_gt", | |||
| value_type=ValueTypeEnum.FLOAT64, | |||
| valid_test_func=check_normal_param_range | |||
| ), | |||
| ConditionParameter( | |||
| name="min_lt", | |||
| value_type=ValueTypeEnum.FLOAT64, | |||
| valid_test_func=check_normal_param_range | |||
| ) | |||
| ], | |||
| supported_target_type=TargetTypeEnum.TENSOR, | |||
| supported_platforms=(PlatformEnum.ASCEND, PlatformEnum.GPU), | |||
| minimum_debugger_capability=(1, 1), | |||
| availability_test_func=check_initialization_available | |||
| ), | |||
| Condition( | |||
| condition_id=ConditionIdEnum.TENSOR_TOO_LARGE, | |||
| abbr="TL", | |||
| @@ -71,7 +71,7 @@ class DebuggerServer: | |||
| def get_condition_collections(self, train_id): | |||
| """Get default condition_collections""" | |||
| metadata_stream = self.cache_store.get_stream_handler(Streams.METADATA) | |||
| condition_context = ConditionContext(metadata_stream.backend, metadata_stream.step, (1, 1)) | |||
| condition_context = ConditionContext(metadata_stream.backend, metadata_stream.step) | |||
| log.debug("Train_id: %s, backend: %s", train_id, condition_context.backend) | |||
| return self.condition_mgr.get_all_collections(condition_context) | |||
| @@ -81,7 +81,7 @@ class DebuggerServer: | |||
| log.error("Bool param should be given for set_recommended") | |||
| raise DebuggerParamValueError("Bool param should be given.") | |||
| metadata_stream = self.cache_store.get_stream_handler(Streams.METADATA) | |||
| condition_context = ConditionContext(metadata_stream.backend, metadata_stream.step, (1, 1)) | |||
| condition_context = ConditionContext(metadata_stream.backend, metadata_stream.step) | |||
| log.debug("Train_id: %s, backend: %s", train_id, condition_context.backend) | |||
| res = metadata_stream.get(['state', 'enable_recheck']) | |||
| if set_recommended and not metadata_stream.recommendation_confirmed: | |||
| @@ -91,6 +91,8 @@ message ViewCMD { | |||
| message WatchCondition { | |||
| enum Condition { | |||
| // nan is not used anymore, but the first enum value must be zero in proto3, so we keep this enum member. | |||
| nan = 0; | |||
| overflow = 2; | |||
| sd_gt = 11; | |||
| sd_lt = 12; | |||
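proto3 requires the first enum value to be zero, which is why `nan = 0` stays as a placeholder even though it is no longer used, while members such as `inf` and the per-statistic comparisons are dropped. Assuming the regenerated module is importable as `debug_grpc_pb2` (the standard naming for the proto file shown in the descriptor below), the kept and removed members can be checked through the generated enum wrapper:

```python
# Hypothetical import path derived from the .proto file name; adjust if the generated module differs.
from mindinsight.debugger.proto.debug_grpc_pb2 import WatchCondition

cond = WatchCondition.Condition
assert cond.Name(0) == "nan"         # placeholder kept so the enum has a zero value
assert cond.Value("overflow") == 2   # retained members keep their wire numbers
assert "inf" not in cond.keys()      # removed members disappear from the generated enum
```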
| @@ -21,7 +21,7 @@ DESCRIPTOR = _descriptor.FileDescriptor( | |||
| package='debugger', | |||
| syntax='proto3', | |||
| serialized_options=None, | |||
| serialized_pb=_b('\n+mindinsight/debugger/proto/debug_grpc.proto\x12\x08\x64\x65\x62ugger\x1a)mindinsight/debugger/proto/ms_graph.proto\"\x92\x01\n\x08Metadata\x12\x13\n\x0b\x64\x65vice_name\x18\x01 \x01(\t\x12\x10\n\x08\x63ur_step\x18\x02 \x01(\x05\x12\x0f\n\x07\x62\x61\x63kend\x18\x03 \x01(\t\x12\x10\n\x08\x63ur_node\x18\x04 \x01(\t\x12\x15\n\rtraining_done\x18\x05 \x01(\x08\x12\x11\n\tgraph_num\x18\x06 \x01(\x05\x12\x12\n\nms_version\x18\x07 \x01(\t\")\n\x05\x43hunk\x12\x0e\n\x06\x62uffer\x18\x01 \x01(\x0c\x12\x10\n\x08\x66inished\x18\x02 \x01(\x08\"\x87\x02\n\nEventReply\x12+\n\x06status\x18\x01 \x01(\x0e\x32\x1b.debugger.EventReply.Status\x12\x0e\n\x04\x65xit\x18\x02 \x01(\x08H\x00\x12#\n\x07run_cmd\x18\x03 \x01(\x0b\x32\x10.debugger.RunCMDH\x00\x12#\n\x07set_cmd\x18\x04 \x01(\x0b\x32\x10.debugger.SetCMDH\x00\x12%\n\x08view_cmd\x18\x05 \x01(\x0b\x32\x11.debugger.ViewCMDH\x00\x12\x19\n\x0fversion_matched\x18\x06 \x01(\x08H\x00\")\n\x06Status\x12\x06\n\x02OK\x10\x00\x12\n\n\x06\x46\x41ILED\x10\x01\x12\x0b\n\x07PENDING\x10\x02\x42\x05\n\x03\x63md\"L\n\x06RunCMD\x12\x11\n\trun_level\x18\x01 \x01(\t\x12\x13\n\trun_steps\x18\x02 \x01(\x05H\x00\x12\x13\n\tnode_name\x18\x03 \x01(\tH\x00\x42\x05\n\x03\x63md\"\x81\x01\n\x06SetCMD\x12(\n\x0bwatch_nodes\x18\x01 \x03(\x0b\x32\x13.debugger.WatchNode\x12\x31\n\x0fwatch_condition\x18\x02 \x01(\x0b\x32\x18.debugger.WatchCondition\x12\x0e\n\x06\x64\x65lete\x18\x03 \x01(\x08\x12\n\n\x02id\x18\x04 \x01(\x05\"1\n\x07ViewCMD\x12&\n\x07tensors\x18\x01 \x03(\x0b\x32\x15.debugger.TensorProto\"\xf4\x04\n\x0eWatchCondition\x12\x35\n\tcondition\x18\x01 \x01(\x0e\x32\".debugger.WatchCondition.Condition\x12\r\n\x05value\x18\x02 \x01(\x02\x12\x32\n\x06params\x18\x04 \x03(\x0b\x32\".debugger.WatchCondition.Parameter\x1a]\n\tParameter\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x10\n\x08\x64isabled\x18\x02 \x01(\x08\x12\r\n\x05value\x18\x03 \x01(\x01\x12\x0b\n\x03hit\x18\x04 \x01(\x08\x12\x14\n\x0c\x61\x63tual_value\x18\x05 \x01(\x01\"\x88\x03\n\tCondition\x12\x07\n\x03nan\x10\x00\x12\x07\n\x03inf\x10\x01\x12\x0c\n\x08overflow\x10\x02\x12\n\n\x06max_gt\x10\x03\x12\n\n\x06max_lt\x10\x04\x12\n\n\x06min_gt\x10\x05\x12\n\n\x06min_lt\x10\x06\x12\x0e\n\nmax_min_gt\x10\x07\x12\x0e\n\nmax_min_lt\x10\x08\x12\x0b\n\x07mean_gt\x10\t\x12\x0b\n\x07mean_lt\x10\n\x12\t\n\x05sd_gt\x10\x0b\x12\t\n\x05sd_lt\x10\x0c\x12\x1b\n\x17tensor_general_overflow\x10\r\x12\x19\n\x15tensor_initialization\x10\x0e\x12\x14\n\x10tensor_too_large\x10\x0f\x12\x14\n\x10tensor_too_small\x10\x10\x12\x13\n\x0ftensor_all_zero\x10\x11\x12\x1b\n\x17tensor_change_too_large\x10\x12\x12\x1b\n\x17tensor_change_too_small\x10\x13\x12\x16\n\x12tensor_not_changed\x10\x14\x12\x10\n\x0ctensor_range\x10\x15\"1\n\tWatchNode\x12\x11\n\tnode_name\x18\x01 \x01(\t\x12\x11\n\tnode_type\x18\x02 \x01(\t\"\x89\x01\n\rWatchpointHit\x12%\n\x06tensor\x18\x01 \x01(\x0b\x32\x15.debugger.TensorProto\x12\x31\n\x0fwatch_condition\x18\x02 \x01(\x0b\x32\x18.debugger.WatchCondition\x12\n\n\x02id\x18\x03 \x01(\x05\x12\x12\n\nerror_code\x18\x04 
\x01(\x05\x32\x81\x03\n\rEventListener\x12\x35\n\x07WaitCMD\x12\x12.debugger.Metadata\x1a\x14.debugger.EventReply\"\x00\x12:\n\x0cSendMetadata\x12\x12.debugger.Metadata\x1a\x14.debugger.EventReply\"\x00\x12\x36\n\tSendGraph\x12\x0f.debugger.Chunk\x1a\x14.debugger.EventReply\"\x00(\x01\x12>\n\x0bSendTensors\x12\x15.debugger.TensorProto\x1a\x14.debugger.EventReply\"\x00(\x01\x12G\n\x12SendWatchpointHits\x12\x17.debugger.WatchpointHit\x1a\x14.debugger.EventReply\"\x00(\x01\x12<\n\x0fSendMultiGraphs\x12\x0f.debugger.Chunk\x1a\x14.debugger.EventReply\"\x00(\x01\x62\x06proto3') | |||
| serialized_pb=_b('\n+mindinsight/debugger/proto/debug_grpc.proto\x12\x08\x64\x65\x62ugger\x1a)mindinsight/debugger/proto/ms_graph.proto\"\x92\x01\n\x08Metadata\x12\x13\n\x0b\x64\x65vice_name\x18\x01 \x01(\t\x12\x10\n\x08\x63ur_step\x18\x02 \x01(\x05\x12\x0f\n\x07\x62\x61\x63kend\x18\x03 \x01(\t\x12\x10\n\x08\x63ur_node\x18\x04 \x01(\t\x12\x15\n\rtraining_done\x18\x05 \x01(\x08\x12\x11\n\tgraph_num\x18\x06 \x01(\x05\x12\x12\n\nms_version\x18\x07 \x01(\t\")\n\x05\x43hunk\x12\x0e\n\x06\x62uffer\x18\x01 \x01(\x0c\x12\x10\n\x08\x66inished\x18\x02 \x01(\x08\"\x87\x02\n\nEventReply\x12+\n\x06status\x18\x01 \x01(\x0e\x32\x1b.debugger.EventReply.Status\x12\x0e\n\x04\x65xit\x18\x02 \x01(\x08H\x00\x12#\n\x07run_cmd\x18\x03 \x01(\x0b\x32\x10.debugger.RunCMDH\x00\x12#\n\x07set_cmd\x18\x04 \x01(\x0b\x32\x10.debugger.SetCMDH\x00\x12%\n\x08view_cmd\x18\x05 \x01(\x0b\x32\x11.debugger.ViewCMDH\x00\x12\x19\n\x0fversion_matched\x18\x06 \x01(\x08H\x00\")\n\x06Status\x12\x06\n\x02OK\x10\x00\x12\n\n\x06\x46\x41ILED\x10\x01\x12\x0b\n\x07PENDING\x10\x02\x42\x05\n\x03\x63md\"L\n\x06RunCMD\x12\x11\n\trun_level\x18\x01 \x01(\t\x12\x13\n\trun_steps\x18\x02 \x01(\x05H\x00\x12\x13\n\tnode_name\x18\x03 \x01(\tH\x00\x42\x05\n\x03\x63md\"\x81\x01\n\x06SetCMD\x12(\n\x0bwatch_nodes\x18\x01 \x03(\x0b\x32\x13.debugger.WatchNode\x12\x31\n\x0fwatch_condition\x18\x02 \x01(\x0b\x32\x18.debugger.WatchCondition\x12\x0e\n\x06\x64\x65lete\x18\x03 \x01(\x08\x12\n\n\x02id\x18\x04 \x01(\x05\"1\n\x07ViewCMD\x12&\n\x07tensors\x18\x01 \x03(\x0b\x32\x15.debugger.TensorProto\"\x81\x04\n\x0eWatchCondition\x12\x35\n\tcondition\x18\x01 \x01(\x0e\x32\".debugger.WatchCondition.Condition\x12\r\n\x05value\x18\x02 \x01(\x02\x12\x32\n\x06params\x18\x04 \x03(\x0b\x32\".debugger.WatchCondition.Parameter\x1a]\n\tParameter\x12\x0c\n\x04name\x18\x01 \x01(\t\x12\x10\n\x08\x64isabled\x18\x02 \x01(\x08\x12\r\n\x05value\x18\x03 \x01(\x01\x12\x0b\n\x03hit\x18\x04 \x01(\x08\x12\x14\n\x0c\x61\x63tual_value\x18\x05 \x01(\x01\"\x95\x02\n\tCondition\x12\x07\n\x03nan\x10\x00\x12\x0c\n\x08overflow\x10\x02\x12\t\n\x05sd_gt\x10\x0b\x12\t\n\x05sd_lt\x10\x0c\x12\x1b\n\x17tensor_general_overflow\x10\r\x12\x19\n\x15tensor_initialization\x10\x0e\x12\x14\n\x10tensor_too_large\x10\x0f\x12\x14\n\x10tensor_too_small\x10\x10\x12\x13\n\x0ftensor_all_zero\x10\x11\x12\x1b\n\x17tensor_change_too_large\x10\x12\x12\x1b\n\x17tensor_change_too_small\x10\x13\x12\x16\n\x12tensor_not_changed\x10\x14\x12\x10\n\x0ctensor_range\x10\x15\"1\n\tWatchNode\x12\x11\n\tnode_name\x18\x01 \x01(\t\x12\x11\n\tnode_type\x18\x02 \x01(\t\"\x89\x01\n\rWatchpointHit\x12%\n\x06tensor\x18\x01 \x01(\x0b\x32\x15.debugger.TensorProto\x12\x31\n\x0fwatch_condition\x18\x02 \x01(\x0b\x32\x18.debugger.WatchCondition\x12\n\n\x02id\x18\x03 \x01(\x05\x12\x12\n\nerror_code\x18\x04 \x01(\x05\x32\x81\x03\n\rEventListener\x12\x35\n\x07WaitCMD\x12\x12.debugger.Metadata\x1a\x14.debugger.EventReply\"\x00\x12:\n\x0cSendMetadata\x12\x12.debugger.Metadata\x1a\x14.debugger.EventReply\"\x00\x12\x36\n\tSendGraph\x12\x0f.debugger.Chunk\x1a\x14.debugger.EventReply\"\x00(\x01\x12>\n\x0bSendTensors\x12\x15.debugger.TensorProto\x1a\x14.debugger.EventReply\"\x00(\x01\x12G\n\x12SendWatchpointHits\x12\x17.debugger.WatchpointHit\x1a\x14.debugger.EventReply\"\x00(\x01\x12<\n\x0fSendMultiGraphs\x12\x0f.debugger.Chunk\x1a\x14.debugger.EventReply\"\x00(\x01\x62\x06proto3') | |||
| , | |||
| dependencies=[mindinsight_dot_debugger_dot_proto_dot_ms__graph__pb2.DESCRIPTOR,]) | |||
| @@ -64,94 +64,58 @@ _WATCHCONDITION_CONDITION = _descriptor.EnumDescriptor( | |||
| serialized_options=None, | |||
| type=None), | |||
| _descriptor.EnumValueDescriptor( | |||
| name='inf', index=1, number=1, | |||
| name='overflow', index=1, number=2, | |||
| serialized_options=None, | |||
| type=None), | |||
| _descriptor.EnumValueDescriptor( | |||
| name='overflow', index=2, number=2, | |||
| name='sd_gt', index=2, number=11, | |||
| serialized_options=None, | |||
| type=None), | |||
| _descriptor.EnumValueDescriptor( | |||
| name='max_gt', index=3, number=3, | |||
| name='sd_lt', index=3, number=12, | |||
| serialized_options=None, | |||
| type=None), | |||
| _descriptor.EnumValueDescriptor( | |||
| name='max_lt', index=4, number=4, | |||
| name='tensor_general_overflow', index=4, number=13, | |||
| serialized_options=None, | |||
| type=None), | |||
| _descriptor.EnumValueDescriptor( | |||
| name='min_gt', index=5, number=5, | |||
| name='tensor_initialization', index=5, number=14, | |||
| serialized_options=None, | |||
| type=None), | |||
| _descriptor.EnumValueDescriptor( | |||
| name='min_lt', index=6, number=6, | |||
| name='tensor_too_large', index=6, number=15, | |||
| serialized_options=None, | |||
| type=None), | |||
| _descriptor.EnumValueDescriptor( | |||
| name='max_min_gt', index=7, number=7, | |||
| name='tensor_too_small', index=7, number=16, | |||
| serialized_options=None, | |||
| type=None), | |||
| _descriptor.EnumValueDescriptor( | |||
| name='max_min_lt', index=8, number=8, | |||
| name='tensor_all_zero', index=8, number=17, | |||
| serialized_options=None, | |||
| type=None), | |||
| _descriptor.EnumValueDescriptor( | |||
| name='mean_gt', index=9, number=9, | |||
| name='tensor_change_too_large', index=9, number=18, | |||
| serialized_options=None, | |||
| type=None), | |||
| _descriptor.EnumValueDescriptor( | |||
| name='mean_lt', index=10, number=10, | |||
| name='tensor_change_too_small', index=10, number=19, | |||
| serialized_options=None, | |||
| type=None), | |||
| _descriptor.EnumValueDescriptor( | |||
| name='sd_gt', index=11, number=11, | |||
| name='tensor_not_changed', index=11, number=20, | |||
| serialized_options=None, | |||
| type=None), | |||
| _descriptor.EnumValueDescriptor( | |||
| name='sd_lt', index=12, number=12, | |||
| serialized_options=None, | |||
| type=None), | |||
| _descriptor.EnumValueDescriptor( | |||
| name='tensor_general_overflow', index=13, number=13, | |||
| serialized_options=None, | |||
| type=None), | |||
| _descriptor.EnumValueDescriptor( | |||
| name='tensor_initialization', index=14, number=14, | |||
| serialized_options=None, | |||
| type=None), | |||
| _descriptor.EnumValueDescriptor( | |||
| name='tensor_too_large', index=15, number=15, | |||
| serialized_options=None, | |||
| type=None), | |||
| _descriptor.EnumValueDescriptor( | |||
| name='tensor_too_small', index=16, number=16, | |||
| serialized_options=None, | |||
| type=None), | |||
| _descriptor.EnumValueDescriptor( | |||
| name='tensor_all_zero', index=17, number=17, | |||
| serialized_options=None, | |||
| type=None), | |||
| _descriptor.EnumValueDescriptor( | |||
| name='tensor_change_too_large', index=18, number=18, | |||
| serialized_options=None, | |||
| type=None), | |||
| _descriptor.EnumValueDescriptor( | |||
| name='tensor_change_too_small', index=19, number=19, | |||
| serialized_options=None, | |||
| type=None), | |||
| _descriptor.EnumValueDescriptor( | |||
| name='tensor_not_changed', index=20, number=20, | |||
| serialized_options=None, | |||
| type=None), | |||
| _descriptor.EnumValueDescriptor( | |||
| name='tensor_range', index=21, number=21, | |||
| name='tensor_range', index=12, number=21, | |||
| serialized_options=None, | |||
| type=None), | |||
| ], | |||
| containing_type=None, | |||
| serialized_options=None, | |||
| serialized_start=1056, | |||
| serialized_end=1448, | |||
| serialized_end=1333, | |||
| ) | |||
| _sym_db.RegisterEnumDescriptor(_WATCHCONDITION_CONDITION) | |||
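In the regenerated descriptor, removing members shifts each remaining member's `index` (its position in the value list) while its `number` (the wire value) is preserved; `tensor_range`, for example, moves from index 21 to index 12 but keeps number 21. Assuming the same generated module as above is importable, this can be checked directly against the descriptor:

```python
# _WATCHCONDITION_CONDITION is the EnumDescriptor constructed in the generated code above.
from mindinsight.debugger.proto.debug_grpc_pb2 import _WATCHCONDITION_CONDITION

tensor_range = _WATCHCONDITION_CONDITION.values_by_name["tensor_range"]
assert tensor_range.number == 21   # wire value is unchanged
assert tensor_range.index == 12    # list position shifts after the removals
```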
| @@ -568,7 +532,7 @@ _WATCHCONDITION = _descriptor.Descriptor( | |||
| oneofs=[ | |||
| ], | |||
| serialized_start=820, | |||
| serialized_end=1448, | |||
| serialized_end=1333, | |||
| ) | |||
| @@ -605,8 +569,8 @@ _WATCHNODE = _descriptor.Descriptor( | |||
| extension_ranges=[], | |||
| oneofs=[ | |||
| ], | |||
| serialized_start=1450, | |||
| serialized_end=1499, | |||
| serialized_start=1335, | |||
| serialized_end=1384, | |||
| ) | |||
| @@ -657,8 +621,8 @@ _WATCHPOINTHIT = _descriptor.Descriptor( | |||
| extension_ranges=[], | |||
| oneofs=[ | |||
| ], | |||
| serialized_start=1502, | |||
| serialized_end=1639, | |||
| serialized_start=1387, | |||
| serialized_end=1524, | |||
| ) | |||
| _EVENTREPLY.fields_by_name['status'].enum_type = _EVENTREPLY_STATUS | |||
| @@ -786,8 +750,8 @@ _EVENTLISTENER = _descriptor.ServiceDescriptor( | |||
| file=DESCRIPTOR, | |||
| index=0, | |||
| serialized_options=None, | |||
| serialized_start=1642, | |||
| serialized_end=2027, | |||
| serialized_start=1527, | |||
| serialized_end=1912, | |||
| methods=[ | |||
| _descriptor.MethodDescriptor( | |||
| name='WaitCMD', | |||
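The descriptor and serialized-offset changes above are the output of regenerating the Python bindings from debug_grpc.proto rather than hand edits. A hedged regeneration sketch using grpcio-tools, assuming the proto lives at the path shown in the descriptor and that the include path matches the repository root:

```python
from grpc_tools import protoc

# Paths are assumptions based on the file name in the descriptor; adjust to the repository layout.
protoc.main([
    "grpc_tools.protoc",
    "-I.",
    "--python_out=.",
    "--grpc_python_out=.",
    "mindinsight/debugger/proto/debug_grpc.proto",
])
```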
| @@ -31,7 +31,6 @@ WATCHPOINT_CONDITION_MAPPING = { | |||
| ConditionIdEnum.GRADIENT_VANISHING.value: WatchCondition.Condition.tensor_too_small, | |||
| ConditionIdEnum.OPERATOR_OVERFLOW.value: WatchCondition.Condition.overflow, | |||
| ConditionIdEnum.TENSOR_ALL_ZERO.value: WatchCondition.Condition.tensor_all_zero, | |||
| ConditionIdEnum.TENSOR_INITIALIZATION.value: WatchCondition.Condition.tensor_initialization, | |||
| ConditionIdEnum.TENSOR_OVERFLOW.value: WatchCondition.Condition.tensor_general_overflow, | |||
| ConditionIdEnum.TENSOR_RANGE.value: WatchCondition.Condition.tensor_range, | |||
| ConditionIdEnum.TENSOR_TOO_LARGE.value: WatchCondition.Condition.tensor_too_large, | |||
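With the tensor_initialization entry gone, translating that condition id to the proto enum through WATCHPOINT_CONDITION_MAPPING no longer succeeds. A small mimic dictionary (not the real mapping, which maps ConditionIdEnum values to WatchCondition.Condition members) to illustrate the lookup:

```python
# Numbers follow the proto enum values shown earlier (illustration only).
WATCHPOINT_CONDITION_MAPPING_SKETCH = {
    "tensor_general_overflow": 13,
    "tensor_too_large": 15,
    "tensor_all_zero": 17,
}

assert WATCHPOINT_CONDITION_MAPPING_SKETCH["tensor_too_large"] == 15
assert "tensor_initialization" not in WATCHPOINT_CONDITION_MAPPING_SKETCH  # no longer mapped
```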
| @@ -436,23 +436,18 @@ class WatchpointHitHandler(StreamHandlerBase): | |||
| """ | |||
| res = {} | |||
| watch_points = [] | |||
| error_codes = set() | |||
| for tensor_hit in tensor_hits: | |||
| error_code = tensor_hit.error_code | |||
| error_list = _get_error_list(error_code) | |||
| watchpoint = tensor_hit.watchpoint | |||
| watchpoint['error_code'] = error_code | |||
| watchpoint['error_list'] = error_list | |||
| watch_points.append(watchpoint) | |||
| error_codes.add(error_code) | |||
| summarized_error_code = error_codes.pop() | |||
| while error_codes: | |||
| temp = error_codes.pop() | |||
| summarized_error_code = summarized_error_code | temp | |||
| if watch_points: | |||
| res = { | |||
| 'slot': slot, | |||
| 'summarized_error_code': summarized_error_code, | |||
| 'watch_points': watch_points | |||
| } | |||
| return res | |||
| @@ -617,3 +612,22 @@ def set_default_param(condition_mgr, watch_condition): | |||
| }) | |||
| watch_condition["abbr"] = condition.abbr | |||
| return watch_condition | |||
| def _get_error_list(error_code): | |||
| """ | |||
| Get error list. | |||
| Args: | |||
| error_code (int): the bit-encoded error code. | |||
| Returns: | |||
| list, the error list. | |||
| """ | |||
| all_error_list = ["nan", "inf", "no_prev_tensor"] | |||
| error_list = [] | |||
| for i, error_str in enumerate(all_error_list): | |||
| error = (error_code >> i) & 1 | |||
| if error == 1: | |||
| error_list.append(error_str) | |||
| return error_list | |||
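`_get_error_list` decodes a bit mask in which bit 0 marks nan, bit 1 inf, and bit 2 no_prev_tensor, so every watchpoint hit now carries a readable error_list next to its raw error_code. Example values, assuming the function above is in scope:

```python
assert _get_error_list(0) == []                       # matches "error_list": [] in the fixtures below
assert _get_error_list(0b001) == ["nan"]
assert _get_error_list(0b101) == ["nan", "no_prev_tensor"]
assert _get_error_list(0b111) == ["nan", "inf", "no_prev_tensor"]
```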
| @@ -21,7 +21,7 @@ from mindinsight.debugger.common.exceptions.exceptions import DebuggerParamValue | |||
| from mindinsight.debugger.common.log import LOGGER as log | |||
| from mindinsight.debugger.common.utils import ServerStatus, \ | |||
| Streams, is_cst_type | |||
| from mindinsight.debugger.conditionmgr.condition import ConditionIdEnum, TargetTypeEnum | |||
| from mindinsight.debugger.conditionmgr.condition import ConditionIdEnum, TargetTypeEnum, ConditionContext | |||
| from mindinsight.debugger.conditionmgr.recommender import get_basic_node_info | |||
| from mindinsight.debugger.stream_handler.watchpoint_handler import validate_watch_condition | |||
| @@ -78,6 +78,12 @@ class WatchpointOperator: | |||
| validate_watch_condition(self._condition_mgr, watch_condition) | |||
| condition_id = watch_condition.get('id') | |||
| condition = self._condition_mgr.get_condition(condition_id) | |||
| condition_context = ConditionContext(metadata_stream.backend, metadata_stream.step) | |||
| if not condition.is_available(condition_context): | |||
| log.error("Failed to create watchpoint as the condition is not available.") | |||
| raise DebuggerCreateWatchPointError( | |||
| "Failed to create watchpoint as the condition is not available.") | |||
| if condition.supported_target_type in [TargetTypeEnum.ACTIVATION, TargetTypeEnum.GRADIENT, | |||
| TargetTypeEnum.WEIGHT]: | |||
| watch_nodes = get_basic_node_info(condition.supported_target_type.value, self._graph_stream).copy() | |||
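The new guard builds a ConditionContext from the metadata stream and rejects watchpoint creation when the chosen condition is not available for that backend and step. A hedged sketch of the same pattern with stand-in objects (the real availability rules live in the condition definitions, and the real code raises DebuggerCreateWatchPointError):

```python
from collections import namedtuple

ConditionContextSketch = namedtuple("ConditionContextSketch", ["backend", "step"])


class ConditionSketch:
    """Stand-in condition whose availability depends only on the backend (illustration only)."""

    def __init__(self, supported_backends):
        self._supported_backends = supported_backends

    def is_available(self, condition_context):
        return condition_context.backend in self._supported_backends


def validate_condition_available(condition, condition_context):
    """Mirror of the new guard in WatchpointOperator."""
    if not condition.is_available(condition_context):
        # The real operator logs the failure and raises DebuggerCreateWatchPointError here.
        raise ValueError("Failed to create watchpoint as the condition is not available.")


validate_condition_available(
    ConditionSketch(supported_backends=("Ascend", "GPU")),
    ConditionContextSketch(backend="GPU", step=0),
)  # passes silently; an Ascend-only condition on a GPU backend would raise instead
```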
| @@ -1 +1 @@ | |||
| {"graph": {"nodes": [{"name": "Default/args0", "full_name": "Default/args0", "type": "Parameter", "input": {}, "output": {"Default/TransData-op99": {"shape": [[32, 1, 32, 32]], "edge_type": "data", "independent_layout": false, "data_type": "DT_TENSOR[DT_FLOAT32]", "slot_mapping": [["0", ""]]}}, "slots": [{"slot": "0", "statistics": {"overall_max": 6.0, "overall_min": 1.0, "overall_avg": 3.5, "overall_count": 6, "overall_nan_count": 0, "overall_neg_inf_count": 0, "overall_pos_inf_count": 0, "overall_zero_count": 0.0, "overall_neg_zero_count": 0.0, "overall_pos_zero_count": 6.0}, "shape": [2, 3], "has_prev_step": true}], "graph_name": "graph_0"}, {"name": "Default/network-WithLossCell/_backbone-LeNet5/conv1-Conv2d/Cast-op190", "full_name": "Default/network-WithLossCell/_backbone-LeNet5/conv1-Conv2d/Cast-op190", "type": "Cast", "input": {"Default/TransData-op99": {"shape": [[32, 1, 32, 32]], "edge_type": "data", "independent_layout": false, "data_type": "DT_TENSOR[DT_FLOAT32]", "slot_mapping": [["0", ""]]}}, "output": {}, "slots": [{"slot": "0", "statistics": {"overall_max": 6.0, "overall_min": 1.0, "overall_avg": 3.5, "overall_count": 6, "overall_nan_count": 0, "overall_neg_inf_count": 0, "overall_pos_inf_count": 0, "overall_zero_count": 0.0, "overall_neg_zero_count": 0.0, "overall_pos_zero_count": 6.0}, "shape": [2, 3]}], "graph_name": "graph_0"}, {"name": "Default/TransData-op99", "full_name": "Default/TransData-op99", "type": "TransData", "input": {"Default/args0": {"shape": [[32, 1, 32, 32]], "edge_type": "data", "independent_layout": false, "data_type": "DT_TENSOR[DT_FLOAT32]", "slot_mapping": [["0", ""]]}}, "output": {"Default/network-WithLossCell/_backbone-LeNet5/conv1-Conv2d/Cast-op190": {"shape": [[32, 1, 32, 32]], "edge_type": "data", "independent_layout": false, "data_type": "DT_TENSOR[DT_FLOAT32]", "slot_mapping": [["0", ""]]}}, "slots": [{"slot": "0", "summarized_error_code": 0, "watch_points": [{"id": 1, "watch_condition": {"id": "tensor_too_large", "params": [{"name": "max_gt", "value": 1.0, "actual_value": null}], "abbr": "TL"}, "error_code": 0}], "statistics": {"overall_max": 6.0, "overall_min": 1.0, "overall_avg": 3.5, "overall_count": 6, "overall_nan_count": 0, "overall_neg_inf_count": 0, "overall_pos_inf_count": 0, "overall_zero_count": 0.0, "overall_neg_zero_count": 0.0, "overall_pos_zero_count": 6.0}, "shape": [2, 3]}], "graph_name": "graph_0"}]}} | |||
| {"graph": {"nodes": [{"name": "Default/args0", "full_name": "Default/args0", "type": "Parameter", "input": {}, "output": {"Default/TransData-op99": {"shape": [[32, 1, 32, 32]], "edge_type": "data", "independent_layout": false, "data_type": "DT_TENSOR[DT_FLOAT32]", "slot_mapping": [["0", ""]]}}, "slots": [{"slot": "0", "statistics": {"overall_max": 6.0, "overall_min": 1.0, "overall_avg": 3.5, "overall_count": 6, "overall_nan_count": 0, "overall_neg_inf_count": 0, "overall_pos_inf_count": 0, "overall_zero_count": 0.0, "overall_neg_zero_count": 0.0, "overall_pos_zero_count": 6.0}, "shape": [2, 3], "has_prev_step": true}], "graph_name": "graph_0"}, {"name": "Default/network-WithLossCell/_backbone-LeNet5/conv1-Conv2d/Cast-op190", "full_name": "Default/network-WithLossCell/_backbone-LeNet5/conv1-Conv2d/Cast-op190", "type": "Cast", "input": {"Default/TransData-op99": {"shape": [[32, 1, 32, 32]], "edge_type": "data", "independent_layout": false, "data_type": "DT_TENSOR[DT_FLOAT32]", "slot_mapping": [["0", ""]]}}, "output": {}, "slots": [{"slot": "0", "statistics": {"overall_max": 6.0, "overall_min": 1.0, "overall_avg": 3.5, "overall_count": 6, "overall_nan_count": 0, "overall_neg_inf_count": 0, "overall_pos_inf_count": 0, "overall_zero_count": 0.0, "overall_neg_zero_count": 0.0, "overall_pos_zero_count": 6.0}, "shape": [2, 3]}], "graph_name": "graph_0"}, {"name": "Default/TransData-op99", "full_name": "Default/TransData-op99", "type": "TransData", "input": {"Default/args0": {"shape": [[32, 1, 32, 32]], "edge_type": "data", "independent_layout": false, "data_type": "DT_TENSOR[DT_FLOAT32]", "slot_mapping": [["0", ""]]}}, "output": {"Default/network-WithLossCell/_backbone-LeNet5/conv1-Conv2d/Cast-op190": {"shape": [[32, 1, 32, 32]], "edge_type": "data", "independent_layout": false, "data_type": "DT_TENSOR[DT_FLOAT32]", "slot_mapping": [["0", ""]]}}, "slots": [{"slot": "0", "watch_points": [{"id": 1, "watch_condition": {"id": "tensor_too_large", "params": [{"name": "max_gt", "value": 1.0, "actual_value": null}], "abbr": "TL"}, "error_code": 0, "error_list": []}], "statistics": {"overall_max": 6.0, "overall_min": 1.0, "overall_avg": 3.5, "overall_count": 6, "overall_nan_count": 0, "overall_neg_inf_count": 0, "overall_pos_inf_count": 0, "overall_zero_count": 0.0, "overall_neg_zero_count": 0.0, "overall_pos_zero_count": 6.0}, "shape": [2, 3]}], "graph_name": "graph_0"}]}} | |||
| @@ -1 +1 @@ | |||
| {"watch_point_hits": [{"node_name": "Default/TransData-op99", "tensors": [{"slot": "0", "summarized_error_code": 0, "watch_points": [{"id": 1, "watch_condition": {"id": "tensor_too_large", "params": [{"name": "max_gt", "value": 1.0, "actual_value": null}], "abbr": "TL"}, "error_code": 0}]}], "graph_name": "graph_0"}, {"node_name": "Default/optimizer-Momentum/ApplyMomentum[8]_1/ApplyMomentum-op25", "tensors": [{"slot": "0", "summarized_error_code": 0, "watch_points": [{"id": 1, "watch_condition": {"id": "tensor_too_large", "params": [{"name": "max_gt", "value": 1.0, "actual_value": null}], "abbr": "TL"}, "error_code": 0}]}], "graph_name": "graph_0"}], "outdated": false} | |||
| {"watch_point_hits": [{"node_name": "Default/TransData-op99", "tensors": [{"slot": "0", "watch_points": [{"id": 1, "watch_condition": {"id": "tensor_too_large", "params": [{"name": "max_gt", "value": 1.0, "actual_value": null}], "abbr": "TL"}, "error_code": 0, "error_list": []}]}], "graph_name": "graph_0"}, {"node_name": "Default/optimizer-Momentum/ApplyMomentum[8]_1/ApplyMomentum-op25", "tensors": [{"slot": "0", "watch_points": [{"id": 1, "watch_condition": {"id": "tensor_too_large", "params": [{"name": "max_gt", "value": 1.0, "actual_value": null}], "abbr": "TL"}, "error_code": 0, "error_list": []}]}], "graph_name": "graph_0"}], "outdated": false} | |||
| @@ -5,11 +5,11 @@ | |||
| "tensors": [ | |||
| { | |||
| "slot": "0", | |||
| "summarized_error_code": 0, | |||
| "watch_points": [ | |||
| { | |||
| "id": 1, | |||
| "error_code": 0, | |||
| "error_list": [], | |||
| "watch_condition": { | |||
| "condition": "MAX_GT", | |||
| "param": 1 | |||
| @@ -193,6 +193,7 @@ class TestDebuggerServer: | |||
| self._server.create_watchpoint({'watch_condition': {'id': 'inf'}}) | |||
| @mock.patch.object(MetadataHandler, 'state', 'waiting') | |||
| @mock.patch.object(MetadataHandler, 'backend', 'GPU') | |||
| @mock.patch.object(GraphHandler, 'get_node_basic_info', return_value=MagicMock()) | |||
| @mock.patch.object(GraphHandler, 'get_node_type', return_value='aggregation_scope') | |||
| @mock.patch.object(WatchpointHandler, 'create_watchpoint') | |||