# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Summary watcher module."""
import os
import re
import datetime
from pathlib import Path

from mindinsight.datavisual.common.log import logger
from mindinsight.datavisual.common.validation import Validation
from mindinsight.datavisual.utils.tools import Counter
from mindinsight.utils.exceptions import ParamValueError
from mindinsight.utils.exceptions import FileSystemPermissionError


class SummaryWatcher:
    """
    Discover summary directories and summary files under a summary base directory.

    A directory counts as a summary directory when it directly contains a regular
    (non-symlink) file whose name matches SUMMARY_FILENAME_REGEX or PB_FILENAME_REGEX.
    Only the base directory itself and its immediate subdirectories are scanned.
    """

    # The named group "timestamp" is read back by the methods below to derive the
    # creation time of a summary file from its filename.
    SUMMARY_FILENAME_REGEX = r'summary\.(?P<timestamp>\d+)'
    PB_FILENAME_REGEX = r'\.pb$'
    MAX_SUMMARY_DIR_COUNT = 999

    # scan at most 20000 files/directories (approximately 1 second)
    # if overall=False in SummaryWatcher.list_summary_directories
    # to avoid long-time blocking
    MAX_SCAN_COUNT = 20000

    def list_summary_directories(self, summary_base_dir, overall=True):
        """
        List summary directories within base directory.

        Args:
            summary_base_dir (str): Path of summary base directory.
            overall (bool): If False, the total number of scanned entries is limited
                to MAX_SCAN_COUNT. Default: True.

        Returns:
            list, list of summary directory info, each of which including the following attributes.
                - relative_path (str): Relative path of summary directory, referring to settings.SUMMARY_BASE_DIR,
                                        starting with "./".
                - create_time (datetime): Creation time of summary file.
                - update_time (datetime): Modification time of summary file.

        Raises:
            FileSystemPermissionError, if the summary base directory is not accessible.

        Examples:
            >>> from mindinsight.datavisual.data_transform.summary_watcher import SummaryWatcher
            >>> summary_watcher = SummaryWatcher()
            >>> directories = summary_watcher.list_summary_directories('/summary/base/dir')
        """
        if self._contains_null_byte(summary_base_dir=summary_base_dir):
            return []

        if not os.path.exists(summary_base_dir):
            logger.warning('Path of summary base directory not exists.')
            return []

        if not os.path.isdir(summary_base_dir):
            logger.warning('Path of summary base directory is not a valid directory.')
            return []

        summary_dict = {}

        if not overall:
            counter = Counter(max_count=self.MAX_SCAN_COUNT)
        else:
            counter = Counter()

        try:
            entries = os.scandir(summary_base_dir)
        except PermissionError:
            logger.error('Path of summary base directory is not accessible.')
            raise FileSystemPermissionError('Path of summary base directory is not accessible.')

        # Files located directly in the base directory are all reported under "./".
        base_relative_path = os.path.join('.', '')

        # The loop may stop early, so close the scandir iterator explicitly
        # instead of leaving the directory handle to the garbage collector.
        with entries:
            for entry in entries:
                if len(summary_dict) == self.MAX_SUMMARY_DIR_COUNT:
                    break

                try:
                    counter.add()
                except ParamValueError:
                    logger.info('Stop further scanning due to overall is False and '
                                'number of scanned files exceeds upper limit.')
                    break

                # Symlinks are never followed, neither files nor directories.
                if entry.is_symlink():
                    continue

                if entry.is_file():
                    self._update_summary_dict(summary_dict, base_relative_path, entry)
                elif entry.is_dir():
                    full_path = os.path.realpath(os.path.join(summary_base_dir, entry.name))

                    # Keep the try limited to scandir so that errors raised while
                    # scanning the entries are not mistaken for an access problem.
                    try:
                        subdir_entries = os.scandir(full_path)
                    except PermissionError:
                        logger.warning('Path of %s under summary base directory is not accessible.', entry.name)
                        continue

                    with subdir_entries:
                        self._scan_subdir_entries(summary_dict, subdir_entries, entry.name, counter)

        directories = [{
            'relative_path': key,
            'create_time': value['ctime'],
            'update_time': value['mtime'],
        } for key, value in summary_dict.items()]

        # sort by update time in descending order and relative path in ascending order
        directories.sort(key=lambda x: (-int(x['update_time'].timestamp()), x['relative_path']))

        return directories

    def _scan_subdir_entries(self, summary_dict, subdir_entries, entry_name, counter):
        """
        Scan the entries of one immediate subdirectory of the summary base directory.

        Args:
            summary_dict (dict): Temporary data structure to hold summary directory info.
            subdir_entries (Iterable[DirEntry]): Entries of the subdirectory, as yielded by os.scandir.
            entry_name (str): Name of the subdirectory, used to build its relative path.
            counter (Counter): Scan counter shared with the caller; scanning stops
                when counter.add() raises ParamValueError.
        """
        subdir_relative_path = os.path.join('.', entry_name)

        for subdir_entry in subdir_entries:
            if len(summary_dict) == self.MAX_SUMMARY_DIR_COUNT:
                break

            try:
                counter.add()
            except ParamValueError:
                logger.info('Stop further scanning due to overall is False and '
                            'number of scanned files exceeds upper limit.')
                break

            # Only regular, non-symlink files can mark a summary directory;
            # nested directories are not descended into.
            if subdir_entry.is_symlink():
                continue

            if subdir_entry.is_file():
                self._update_summary_dict(summary_dict, subdir_relative_path, subdir_entry)

    def _contains_null_byte(self, **kwargs):
        """
        Check if any string argument contains a null byte.

        Args:
            kwargs (Any): Named values to check; values that are not str are ignored.

        Returns:
            bool, indicates if any arg contains null byte.
        """
        for key, value in kwargs.items():
            if not isinstance(value, str):
                continue
            if '\x00' in value:
                logger.warning('%s contains null byte \\x00.', key)
                return True

        return False

    @staticmethod
    def _timestamp_to_datetime(timestamp):
        """
        Convert a POSIX timestamp taken from a filename to a timezone-aware local datetime.

        Args:
            timestamp (int): Seconds since the epoch.

        Returns:
            Union[datetime.datetime, None], the converted datetime, or None if the
            timestamp cannot be represented.
        """
        try:
            return datetime.datetime.fromtimestamp(timestamp).astimezone()
        except (OverflowError, ValueError, OSError):
            # The filename pattern accepts any run of digits, so the value may be
            # far out of range: fromtimestamp raises OverflowError or OSError for
            # values the platform cannot handle and ValueError for years > 9999.
            return None

    def _is_valid_summary_directory(self, summary_base_dir, relative_path):
        """
        Check if the given summary directory is valid.

        The directory is valid when it is the base directory itself, or an existing
        directory that resolves (after symlink resolution) to a path inside the
        base directory.

        Args:
            summary_base_dir (str): Path of summary base directory.
            relative_path (str): Relative path of summary directory, referring to summary base directory,
                                starting with "./" .

        Returns:
            bool, indicates if summary directory is valid.
        """
        summary_base_dir = os.path.realpath(summary_base_dir)
        summary_directory = os.path.realpath(os.path.join(summary_base_dir, relative_path))
        if summary_base_dir == summary_directory:
            return True

        if not os.path.exists(summary_directory):
            logger.warning('Path of summary directory not exists.')
            return False

        if not os.path.isdir(summary_directory):
            logger.warning('Path of summary directory is not a valid directory.')
            return False

        # relative_to raises ValueError when the resolved path escapes the base directory.
        try:
            Path(summary_directory).relative_to(Path(summary_base_dir))
        except ValueError:
            logger.warning('Relative path %s is not subdirectory of summary_base_dir', relative_path)
            return False

        return True

    def _update_summary_dict(self, summary_dict, relative_path, entry):
        """
        Update summary_dict with ctime and mtime.

        Entries whose name matches neither the summary nor the pb pattern are ignored.
        An existing record is replaced only when the new creation time is later.

        Args:
            summary_dict (dict): Temporary data structure to hold summary directory info.
            relative_path (str): Relative path of summary directory, referring to summary base directory,
                                starting with "./" .
            entry (DirEntry): Directory entry instance needed to check with regular expression.
        """
        summary_pattern = re.search(self.SUMMARY_FILENAME_REGEX, entry.name)
        pb_pattern = re.search(self.PB_FILENAME_REGEX, entry.name)
        if summary_pattern is None and pb_pattern is None:
            return

        if summary_pattern is not None:
            # extract created time from filename
            ctime = self._timestamp_to_datetime(int(summary_pattern.group('timestamp')))
            if ctime is None:
                return
        else:
            # pb files carry no timestamp in their name, fall back to the filesystem
            ctime = datetime.datetime.fromtimestamp(entry.stat().st_ctime).astimezone()

        # extract modified time from filesystem
        mtime = datetime.datetime.fromtimestamp(entry.stat().st_mtime).astimezone()

        if relative_path not in summary_dict or summary_dict[relative_path]['ctime'] < ctime:
            summary_dict[relative_path] = {
                'ctime': ctime,
                'mtime': mtime,
            }

    def is_summary_directory(self, summary_base_dir, relative_path):
        """
        Check if the given directory is a summary directory.

        The directory must be valid (see _is_valid_summary_directory) and directly
        contain at least one regular, non-symlink file matching the summary or pb
        filename pattern.

        Args:
            summary_base_dir (str): Path of summary base directory.
            relative_path (str): Relative path of summary directory, referring to summary base directory,
                                starting with "./" .

        Returns:
            bool, indicates if the given summary directory is valid.

        Raises:
            FileSystemPermissionError, if the directory is not accessible.

        Examples:
            >>> from mindinsight.datavisual.data_transform.summary_watcher import SummaryWatcher
            >>> summary_watcher = SummaryWatcher()
            >>> summaries = summary_watcher.is_summary_directory('/summary/base/dir', './job-01')
        """
        if self._contains_null_byte(summary_base_dir=summary_base_dir, relative_path=relative_path):
            return False

        if not self._is_valid_summary_directory(summary_base_dir, relative_path):
            return False

        summary_directory = os.path.realpath(os.path.join(summary_base_dir, relative_path))
        try:
            entries = os.scandir(summary_directory)
        except PermissionError:
            logger.error('Path of summary base directory is not accessible.')
            raise FileSystemPermissionError('Path of summary base directory is not accessible.')

        # Returning on the first match leaves the iterator unexhausted, so close it explicitly.
        with entries:
            for entry in entries:
                if entry.is_symlink() or not entry.is_file():
                    continue
                summary_pattern = re.search(self.SUMMARY_FILENAME_REGEX, entry.name)
                pb_pattern = re.search(self.PB_FILENAME_REGEX, entry.name)
                if summary_pattern or pb_pattern:
                    return True

        return False

    def list_summary_directories_by_pagination(self, summary_base_dir, offset=0, limit=10):
        """
        List summary directories within base directory, one page at a time.

        The scan is performed with overall=False, i.e. it is bounded by MAX_SCAN_COUNT.

        Args:
            summary_base_dir (str): Path of summary base directory.
            offset (int): An offset for page. Ex, offset is 0, mean current page is 1. Default value is 0.
            limit (int): The max data items for per page. Default value is 10.

        Returns:
            tuple[total, directories], total indicates the overall number of summary directories and directories
                    indicate list of summary directory info including the following attributes.
                - relative_path (str): Relative path of summary directory, referring to settings.SUMMARY_BASE_DIR,
                                        starting with "./".
                - create_time (datetime): Creation time of summary file.
                - update_time (datetime): Modification time of summary file.

        Raises:
            ParamValueError, if offset < 0 or limit is out of valid value range.
            ParamTypeError, if offset or limit is not valid integer.

        Examples:
            >>> from mindinsight.datavisual.data_transform.summary_watcher import SummaryWatcher
            >>> summary_watcher = SummaryWatcher()
            >>> total, directories = summary_watcher.list_summary_directories_by_pagination(
                        '/summary/base/dir', offset=0, limit=10)
        """
        offset = Validation.check_offset(offset=offset)
        limit = Validation.check_limit(limit, min_value=1, max_value=999)

        directories = self.list_summary_directories(summary_base_dir, overall=False)
        # offset is a page index, not an item index
        return len(directories), directories[offset * limit:(offset + 1) * limit]

    def list_summaries(self, summary_base_dir, relative_path='./'):
        """
        List summary files within the given summary directory.

        Only regular, non-symlink files matching the summary filename pattern are
        listed; files whose filename timestamp cannot be converted are skipped.

        Args:
            summary_base_dir (str): Path of summary base directory.
            relative_path (str): Relative path of summary directory, referring to summary base directory,
                                starting with "./" .

        Returns:
            list, list of summary file including the following attributes.
                - file_name (str): Summary file name.
                - create_time (datetime): Creation time of summary file.
                - update_time (datetime): Modification time of summary file.

        Raises:
            FileSystemPermissionError, if the summary directory is not accessible.

        Examples:
            >>> from mindinsight.datavisual.data_transform.summary_watcher import SummaryWatcher
            >>> summary_watcher = SummaryWatcher()
            >>> summaries = summary_watcher.list_summaries('/summary/base/dir', './job-01')
        """
        if self._contains_null_byte(summary_base_dir=summary_base_dir, relative_path=relative_path):
            return []

        if not self._is_valid_summary_directory(summary_base_dir, relative_path):
            return []

        summaries = []
        summary_directory = os.path.realpath(os.path.join(summary_base_dir, relative_path))
        try:
            entries = os.scandir(summary_directory)
        except PermissionError:
            logger.error('Path of summary directory is not accessible.')
            raise FileSystemPermissionError('Path of summary directory is not accessible.')

        with entries:
            for entry in entries:
                if entry.is_symlink() or not entry.is_file():
                    continue

                pattern = re.search(self.SUMMARY_FILENAME_REGEX, entry.name)
                if pattern is None:
                    continue

                # extract created time from filename
                ctime = self._timestamp_to_datetime(int(pattern.group('timestamp')))
                if ctime is None:
                    continue

                # extract modified time from filesystem
                mtime = datetime.datetime.fromtimestamp(entry.stat().st_mtime).astimezone()

                summaries.append({
                    'file_name': entry.name,
                    'create_time': ctime,
                    'update_time': mtime,
                })

        # sort by update time in descending order and filename in ascending order
        summaries.sort(key=lambda x: (-int(x['update_time'].timestamp()), x['file_name']))

        return summaries