From f317763313c39529ed5fccc5114dc6c78b1db709 Mon Sep 17 00:00:00 2001
From: jiangshuqiang <962978787@qq.com>
Date: Thu, 29 Oct 2020 22:38:15 +0800
Subject: [PATCH] Fix the unreasonable log when the image tag name is too long;
 add info when the CRC check fails.

---
 mindinsight/datavisual/common/exceptions.py   |  9 ++++
 .../data_transform/ms_data_loader.py          | 24 ++++++---
 .../summary_parser/event_parser.py            | 50 ++++++++++++-------
 mindinsight/scripts/parse_summary.py          | 29 ++++++-----
 mindinsight/utils/constant.py                 |  1 +
 5 files changed, 75 insertions(+), 38 deletions(-)

diff --git a/mindinsight/datavisual/common/exceptions.py b/mindinsight/datavisual/common/exceptions.py
index 47322d4a..c27c442a 100644
--- a/mindinsight/datavisual/common/exceptions.py
+++ b/mindinsight/datavisual/common/exceptions.py
@@ -64,6 +64,15 @@ class CRCFailedError(MindInsightException):
                                              http_code=400)
 
 
+class CRCLengthFailedError(MindInsightException):
+    """CRC length check failed, record may be truncated."""
+    def __init__(self):
+        error_msg = 'CRC Length Failed.'
+        super(CRCLengthFailedError, self).__init__(DataVisualErrors.CRC_LENGTH_FAILED,
+                                                   error_msg,
+                                                   http_code=400)
+
+
 class SummaryLogIsLoading(MindInsightException):
     """Data is loading."""
 
diff --git a/mindinsight/datavisual/data_transform/ms_data_loader.py b/mindinsight/datavisual/data_transform/ms_data_loader.py
index d2d81395..d1d03039 100644
--- a/mindinsight/datavisual/data_transform/ms_data_loader.py
+++ b/mindinsight/datavisual/data_transform/ms_data_loader.py
@@ -20,6 +20,7 @@
 Each instance will read an entire run, a run can contain one or more log file.
 """
 import re
 import struct
+import time
 
 from google.protobuf.message import DecodeError
@@ -46,6 +47,7 @@ from mindinsight.utils.exceptions import UnknownError
 HEADER_SIZE = 8
 CRC_STR_SIZE = 4
 MAX_EVENT_STRING = 500000000
+RETRY_TIMES = 2
 
 
 class MSDataLoader:
@@ -368,10 +370,12 @@ class _SummaryParser(_Parser):
         Returns:
             bool, True if the summary file is finished loading.
""" + crc_check_time = 0 while True: start_offset = file_handler.offset try: event_str = self._event_load(file_handler) + crc_check_time = 0 if event_str is None: file_handler.reset_offset(start_offset) return True @@ -399,6 +403,18 @@ class _SummaryParser(_Parser): future.add_done_callback(exception_no_raise_wrapper(_add_tensor_event_callback)) return False + except exceptions.CRCLengthFailedError: + if crc_check_time > RETRY_TIMES: + logger.warning( + "Check crc length failed, please check the summary file integrity, " + "the file may be in transfer, file_path: %s, offset=%s.", + file_handler.file_path, start_offset) + return True + logger.info( + "Check crc failed, retrying %d/%d times.", crc_check_time + 1, RETRY_TIMES + 1) + file_handler.reset_offset(start_offset) + crc_check_time += 1 + time.sleep(0.5) except exceptions.CRCFailedError: file_handler.reset_offset(start_offset) logger.warning("Check crc faild and ignore this file, file_path=%s, " @@ -432,9 +448,7 @@ class _SummaryParser(_Parser): header_crc_str = '' if len(header_str) != HEADER_SIZE or len(header_crc_str) != CRC_STR_SIZE: - logger.warning("Check header size and crc, record truncated at offset %s, " - "file_path=%s.", file_handler.offset, file_handler.file_path) - return None + raise exceptions.CRCLengthFailedError if not crc32.CheckValueAgainstData(header_crc_str, header_str, HEADER_SIZE): raise exceptions.CRCFailedError() @@ -450,9 +464,7 @@ class _SummaryParser(_Parser): event_crc_str = '' if len(event_str) != event_len or len(event_crc_str) != CRC_STR_SIZE: - logger.warning("Check event crc, record truncated at offset %d, file_path: %s.", - file_handler.offset, file_handler.file_path) - return None + raise exceptions.CRCLengthFailedError if not crc32.CheckValueAgainstData(event_crc_str, event_str, event_len): raise exceptions.CRCFailedError() diff --git a/mindinsight/datavisual/data_transform/summary_parser/event_parser.py b/mindinsight/datavisual/data_transform/summary_parser/event_parser.py index 88b2f073..adaadef7 100644 --- a/mindinsight/datavisual/data_transform/summary_parser/event_parser.py +++ b/mindinsight/datavisual/data_transform/summary_parser/event_parser.py @@ -18,6 +18,7 @@ Scalar Writer. This module write scalar into a csv file. """ import os +import time import struct from google.protobuf.message import DecodeError @@ -36,6 +37,7 @@ MAX_EVENT_STRING = 500000000 SCALAR = 'scalar_value' IMAGE = 'image' INFO_INTERVAL = 10 +RETRY_TIMES = 2 class EventParser(): @@ -45,7 +47,6 @@ class EventParser(): self._output = output self._scalar_writer = ScalarWriter(self._output) self._image_writer = ImageWriter(FileHandler.join(self._output, IMAGE)) - self._current = 0 self._file_size = 0 self._process_info = 0 self._image_check = False @@ -63,15 +64,14 @@ class EventParser(): parse_summary_logger.info("Loading %s.", self.summary_file) result = self._load(summary_file_handler) - parse_summary_logger.info("Writing scalar.csv") - self._scalar_writer.write() - warning = '' if not self._scalar_check: warning = warning + " the summary file contains no scalar value." if not self._image_check: warning = warning + " the summary file contains no image." if result: + parse_summary_logger.info("Writing parsed data into scalar.csv") + self._scalar_writer.write() if warning: parse_summary_logger.warning(warning) parse_summary_logger.info("Finished loading %s.", self.summary_file) @@ -86,9 +86,12 @@ class EventParser(): Returns: bool, True if the summary file is finished loading. 
""" + crc_check_time = 0 while True: + start_offset = file_handler.offset try: event_str = self._event_load(file_handler) + crc_check_time = 0 if event_str is None: return True if len(event_str) > MAX_EVENT_STRING: @@ -96,10 +99,23 @@ class EventParser(): file_handler.file_path, len(event_str), MAX_EVENT_STRING) continue self._event_parse(event_str) + except exceptions.CRCLengthFailedError: + if crc_check_time > RETRY_TIMES: + parse_summary_logger.error( + "Check crc length failed, please check the summary file integrity, " + "the file may be in transfer, file_path: %s, offset=%s.", + file_handler.file_path, start_offset) + return True + parse_summary_logger.warning( + "Check crc failed, retrying %d/%d times.", crc_check_time + 1, RETRY_TIMES + 1) + file_handler.reset_offset(start_offset) + crc_check_time += 1 + time.sleep(0.5) except exceptions.CRCFailedError: - parse_summary_logger.error("Check crc faild, file_path=%s, offset=%s.", file_handler.file_path, - file_handler.offset) - return False + parse_summary_logger.error( + "Check crc failed, the file may have been modified, file_path=%s, offset=%s.", + file_handler.file_path, start_offset) + return True except (OSError, DecodeError, exceptions.MindInsightException) as ex: parse_summary_logger.error("Parse file fail, detail: %r, file path: %s.", str(ex), file_handler.file_path) @@ -126,9 +142,7 @@ class EventParser(): header_crc_str = '' if len(header_str) != HEADER_SIZE or len(header_crc_str) != CRC_STR_SIZE: - parse_summary_logger.error("Check header size and crc, record truncated at offset %s, file_path=%s.", - file_handler.offset, file_handler.file_path) - return None + raise exceptions.CRCLengthFailedError if not crc32.CheckValueAgainstData(header_crc_str, header_str, HEADER_SIZE): raise exceptions.CRCFailedError() @@ -145,16 +159,18 @@ class EventParser(): event_crc_str = '' if len(event_str) != event_len or len(event_crc_str) != CRC_STR_SIZE: - parse_summary_logger.error("Check event crc, record truncated at offset %d, file_path: %s.", - file_handler.offset, file_handler.file_path) - return None + raise exceptions.CRCLengthFailedError + if not crc32.CheckValueAgainstData(event_crc_str, event_str, event_len): raise exceptions.CRCFailedError() - self._current += HEADER_SIZE + 2 * CRC_STR_SIZE + event_len - if self._current >= self._process_info: - parse_summary_logger.info("Current parsing process: %d/%d, %d%%.", self._current, self._file_size, - 100 * self._current // self._file_size) + + current_offset = file_handler.offset + if current_offset >= self._process_info: + parse_summary_logger.info("Current parsing process: %d/%d, %d%%.", current_offset, self._file_size, + 100 * current_offset // os.path.getsize(self.summary_file)) self._process_info += self._file_size // INFO_INTERVAL + if self._process_info > os.path.getsize(self.summary_file): + self._process_info = os.path.getsize(self.summary_file) return event_str def _event_parse(self, event_str): diff --git a/mindinsight/scripts/parse_summary.py b/mindinsight/scripts/parse_summary.py index 6cabe810..20126a28 100644 --- a/mindinsight/scripts/parse_summary.py +++ b/mindinsight/scripts/parse_summary.py @@ -26,7 +26,7 @@ from mindinsight.datavisual.data_transform.ms_data_loader import _SummaryParser from mindinsight.datavisual.data_transform.summary_parser.event_parser import EventParser -class FileDirAction(argparse.Action): +class DirAction(argparse.Action): """File directory action class definition.""" @staticmethod @@ -72,7 +72,7 @@ class OutputDirAction(argparse.Action): 
values (object): Argument values with type depending on argument definition. option_string (str): Optional string for specific argument name. Default: None. """ - output = FileDirAction.check_path(values) + output = DirAction.check_path(values) setattr(namespace, self.dest, output) @@ -94,7 +94,7 @@ class Command(BaseCommand): parser.add_argument( '--summary-dir', type=str, - action=FileDirAction, + action=DirAction, default=os.path.realpath(os.getcwd()), help=""" Optional, specify path for summary file directory. @@ -120,7 +120,7 @@ class Command(BaseCommand): """ try: date_time = datetime.datetime.now().strftime('output_%Y%m%d_%H%M%S_%f') - output_filename = os.path.join(args.output, date_time) + output_path = os.path.join(args.output, date_time) summary_dir = args.summary_dir @@ -140,10 +140,10 @@ class Command(BaseCommand): summary_file = FileHandler.join(summary_dir, filename) if not (self._check_filepath(summary_file) and self._check_create_filepath( - output_filename) and self._check_create_filepath(FileHandler.join(output_filename, 'image'))): + output_path) and self._check_create_filepath(FileHandler.join(output_path, 'image'))): return - eventparser = EventParser(summary_file, output_filename) + eventparser = EventParser(summary_file, output_path) eventparser.parse() except Exception as ex: @@ -158,15 +158,13 @@ class Command(BaseCommand): Args: filepath (str): File path. """ - if os.path.exists(filepath): - if not os.path.isfile(filepath): - parse_summary_logger.error('Summary file %s is not a valid file.', filepath) - return False - if not os.access(filepath, os.R_OK): - parse_summary_logger.error('Path %s is not accessible, please check the file-authority.', filepath) - return True - parse_summary_logger.error('Summary file %s not exists.', filepath) - return False + if not os.path.isfile(filepath): + parse_summary_logger.error('Summary file %s is not a valid file.', filepath) + return False + if not os.access(filepath, os.R_OK): + parse_summary_logger.error('Path %s is not accessible, please check the file-authority.', filepath) + return False + return True @staticmethod def _check_dirpath(filepath): @@ -182,6 +180,7 @@ class Command(BaseCommand): return False if not os.access(filepath, os.R_OK | os.X_OK): parse_summary_logger.error('Path %s is not accessible, please check the file-authority.', filepath) + return False return True parse_summary_logger.error('Summary directory %s not exists.', filepath) return False diff --git a/mindinsight/utils/constant.py b/mindinsight/utils/constant.py index 00622187..e841cb5a 100644 --- a/mindinsight/utils/constant.py +++ b/mindinsight/utils/constant.py @@ -82,6 +82,7 @@ class DataVisualErrors(Enum): TENSOR_NOT_EXIST = 18 MAX_RESPONSE_DATA_EXCEEDED_ERROR = 19 STEP_TENSOR_DATA_NOT_IN_CACHE = 20 + CRC_LENGTH_FAILED = 21 class ScriptConverterErrors(Enum):
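
For reviewers, below is a minimal, self-contained sketch of the retry-on-truncation pattern this patch adds to both loaders: when a record's header or body comes back shorter than expected, the reader treats it as a possibly still-growing file, rewinds to the start of the record, and retries a bounded number of times before giving up. The names used here (read_record, load, RecordTruncatedError) are illustrative stand-ins, not MindInsight APIs; the real loaders also verify the CRC32 of the header and body, which this sketch skips.

"""Sketch of the bounded retry-on-truncation loop introduced by this patch."""
import struct
import time

RETRY_TIMES = 2          # extra attempts before giving up, mirrors the patch
HEADER_SIZE = 8          # 8-byte little-endian record length
CRC_STR_SIZE = 4         # 4-byte CRC following the header and the body


class RecordTruncatedError(Exception):
    """Raised when fewer bytes than expected are available (file may still be written)."""


def read_record(handle):
    """Read one length-prefixed record; return None at a clean end of file."""
    header = handle.read(HEADER_SIZE)
    if not header:
        return None                                  # clean EOF
    if len(header) != HEADER_SIZE:
        raise RecordTruncatedError("incomplete header")
    handle.read(CRC_STR_SIZE)                        # header CRC (not verified in this sketch)
    (body_len,) = struct.unpack('<Q', header)
    body = handle.read(body_len)
    crc = handle.read(CRC_STR_SIZE)
    if len(body) != body_len or len(crc) != CRC_STR_SIZE:
        raise RecordTruncatedError("incomplete body")
    return body


def load(handle):
    """Yield complete records, retrying a truncated tail in case the writer catches up."""
    truncated_retries = 0
    while True:
        start_offset = handle.tell()
        try:
            record = read_record(handle)
            truncated_retries = 0                    # a full record resets the retry budget
        except RecordTruncatedError:
            if truncated_retries > RETRY_TIMES:
                return                               # give up: the file is genuinely truncated
            handle.seek(start_offset)                # rewind to the record start and retry
            truncated_retries += 1
            time.sleep(0.5)
            continue
        if record is None:
            return
        yield record

Usage would be along the lines of: open the summary file in binary mode and iterate over load(f). Bounding the retries and rewinding to the record start is what lets a file that is still being copied or flushed avoid being misreported as corrupt, while a genuinely truncated file is still abandoned after a short, fixed delay.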