|
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344 |
- # Copyright 2020 Huawei Technologies Co., Ltd
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- # ============================================================================
- """Summary watcher module."""
-
- import os
- import re
- import datetime
- from pathlib import Path
-
- from mindinsight.datavisual.common.log import logger
- from mindinsight.datavisual.common.validation import Validation
- from mindinsight.utils.exceptions import FileSystemPermissionError
-
-
class SummaryWatcher:
    """
    Discover summary directories and summary files below a summary base directory.

    A directory counts as a summary directory when it directly contains a regular
    (non-symlink) file whose name matches SUMMARY_FILENAME_REGEX or PB_FILENAME_REGEX.
    """

    SUMMARY_FILENAME_REGEX = r'summary\.(?P<timestamp>\d+)'
    PB_FILENAME_REGEX = r'\.pb$'
    MAX_SUMMARY_DIR_COUNT = 999

    # scan at most 20000 files/directories (approximately 1 second)
    # if overall=False in SummaryWatcher.list_summary_directories
    # to avoid long-time blocking
    MAX_SCAN_COUNT = 20000

    # Exceptions datetime.fromtimestamp() may raise for a timestamp taken from a file name:
    # OverflowError (outside the platform time_t range), ValueError (year outside the
    # range datetime supports, e.g. a millisecond timestamp) and OSError (localtime() failure).
    _TIMESTAMP_ERRORS = (OverflowError, ValueError, OSError)

    def list_summary_directories(self, summary_base_dir, overall=True):
        """
        List summary directories within base directory.

        The base directory itself and its direct subdirectories are scanned; deeper levels
        are not visited and symbolic links are skipped. At most MAX_SUMMARY_DIR_COUNT
        directories are collected.

        Args:
            summary_base_dir (str): Path of summary base directory.
            overall (bool): If False, scanning stops once MAX_SCAN_COUNT entries have been
                visited, so the result may be incomplete. Default value is True.

        Returns:
            list, list of summary directory info, each of which including the following attributes.
                - relative_path (str): Relative path of summary directory, referring to summary_base_dir,
                                       starting with "./".
                - create_time (datetime): Creation time of summary file.
                - update_time (datetime): Modification time of summary file.
            An empty list is returned if summary_base_dir contains a null byte, does not exist
            or is not a directory.

        Raises:
            FileSystemPermissionError, if summary_base_dir is not accessible.

        Examples:
            >>> from mindinsight.datavisual.data_transform.summary_watcher import SummaryWatcher
            >>> summary_watcher = SummaryWatcher()
            >>> directories = summary_watcher.list_summary_directories('/summary/base/dir')
        """
        if self._contains_null_byte(summary_base_dir=summary_base_dir):
            return []

        if not os.path.exists(summary_base_dir):
            logger.warning('Path of summary base directory not exists.')
            return []

        if not os.path.isdir(summary_base_dir):
            logger.warning('Path of summary base directory is not a valid directory.')
            return []

        summary_dict = {}
        scan_count = 0

        try:
            entries = os.scandir(summary_base_dir)
        except PermissionError:
            logger.error('Path of summary base directory is not accessible.')
            raise FileSystemPermissionError('Path of summary base directory is not accessible.')

        # Files located directly in the base directory are all reported under "./".
        relative_path = os.path.join('.', '')

        # The scandir iterators are used as context managers so that their directory
        # handles are released even when a loop is left early through ``break``.
        with entries:
            for entry in entries:
                if len(summary_dict) == self.MAX_SUMMARY_DIR_COUNT:
                    break
                if entry.is_symlink():
                    pass
                elif entry.is_file():
                    self._update_summary_dict(summary_dict, relative_path, entry)
                elif entry.is_dir():
                    full_path = os.path.realpath(os.path.join(summary_base_dir, entry.name))

                    try:
                        subdir_entries = os.scandir(full_path)
                    except PermissionError:
                        logger.warning('Path of %s under summary base directory is not accessible.', entry.name)
                    else:
                        subdir_relative_path = os.path.join('.', entry.name)
                        with subdir_entries:
                            for subdir_entry in subdir_entries:
                                if len(summary_dict) == self.MAX_SUMMARY_DIR_COUNT:
                                    break
                                if subdir_entry.is_symlink():
                                    pass
                                elif subdir_entry.is_file():
                                    self._update_summary_dict(summary_dict, subdir_relative_path, subdir_entry)

                                scan_count += 1
                                if not overall and scan_count >= self.MAX_SCAN_COUNT:
                                    break

                # The entry of the base directory itself counts towards the scan budget too.
                scan_count += 1
                if not overall and scan_count >= self.MAX_SCAN_COUNT:
                    logger.info('Stop further scanning due to overall is False and '
                                'number of scanned files exceeds upper limit.')
                    break

        directories = [{
            'relative_path': key,
            'create_time': value['ctime'],
            'update_time': value['mtime'],
        } for key, value in summary_dict.items()]

        # sort by update time in descending order and relative path in ascending order
        directories.sort(key=lambda x: (-int(x['update_time'].timestamp()), x['relative_path']))

        return directories

    def _contains_null_byte(self, **kwargs):
        """
        Check if arg contains null byte.

        Args:
            kwargs (Any): Named values to check; values that are not str are ignored.

        Returns:
            bool, indicates if any arg contains null byte.
        """
        for key, value in kwargs.items():
            if not isinstance(value, str):
                continue
            if '\x00' in value:
                logger.warning('%s contains null byte \\x00.', key)
                return True

        return False

    def _is_valid_summary_directory(self, summary_base_dir, relative_path):
        """
        Check if the given summary directory is valid.

        The directory is valid when it resolves to the base directory itself, or when it
        exists, is a directory and is located below the resolved base directory.

        Args:
            summary_base_dir (str): Path of summary base directory.
            relative_path (str): Relative path of summary directory, referring to summary base directory,
                                 starting with "./" .

        Returns:
            bool, indicates if summary directory is valid.
        """
        summary_base_dir = os.path.realpath(summary_base_dir)
        summary_directory = os.path.realpath(os.path.join(summary_base_dir, relative_path))
        if summary_base_dir == summary_directory:
            return True

        if not os.path.exists(summary_directory):
            logger.warning('Path of summary directory not exists.')
            return False

        if not os.path.isdir(summary_directory):
            logger.warning('Path of summary directory is not a valid directory.')
            return False

        # Both paths are resolved above, so a relative_path that escapes the base
        # directory (e.g. through "..") makes relative_to() raise ValueError.
        try:
            Path(summary_directory).relative_to(Path(summary_base_dir))
        except ValueError:
            logger.warning('Relative path %s is not subdirectory of summary_base_dir', relative_path)
            return False

        return True

    def _update_summary_dict(self, summary_dict, relative_path, entry):
        """
        Update summary_dict with ctime and mtime.

        Entries whose name matches neither SUMMARY_FILENAME_REGEX nor PB_FILENAME_REGEX are
        ignored, and so are summary files whose embedded timestamp cannot be converted to a
        datetime. For each relative_path the info of the file with the latest ctime is kept.

        Args:
            summary_dict (dict): Temporary data structure to hold summary directory info.
            relative_path (str): Relative path of summary directory, referring to summary base directory,
                                 starting with "./" .
            entry (DirEntry): Directory entry instance needed to check with regular expression.
        """
        summary_pattern = re.search(self.SUMMARY_FILENAME_REGEX, entry.name)
        pb_pattern = re.search(self.PB_FILENAME_REGEX, entry.name)
        if summary_pattern is None and pb_pattern is None:
            return

        if summary_pattern is not None:
            timestamp = int(summary_pattern.groupdict().get('timestamp'))
            try:
                # extract created time from filename
                ctime = datetime.datetime.fromtimestamp(timestamp).astimezone()
            except self._TIMESTAMP_ERRORS:
                # the digits in the file name are not a representable timestamp
                return
        else:
            # .pb files carry no timestamp in their name, fall back to the filesystem
            ctime = datetime.datetime.fromtimestamp(entry.stat().st_ctime).astimezone()

        # extract modified time from filesystem
        mtime = datetime.datetime.fromtimestamp(entry.stat().st_mtime).astimezone()

        if relative_path not in summary_dict or summary_dict[relative_path]['ctime'] < ctime:
            summary_dict[relative_path] = {
                'ctime': ctime,
                'mtime': mtime,
            }

    def is_summary_directory(self, summary_base_dir, relative_path):
        """
        Check if the given summary directory is valid.

        Args:
            summary_base_dir (str): Path of summary base directory.
            relative_path (str): Relative path of summary directory, referring to summary base directory,
                                 starting with "./" .

        Returns:
            bool, indicates if the given summary directory is valid, i.e. it passes the path
            checks and directly contains a regular file matching SUMMARY_FILENAME_REGEX or
            PB_FILENAME_REGEX.

        Raises:
            FileSystemPermissionError, if the summary directory is not accessible.

        Examples:
            >>> from mindinsight.datavisual.data_transform.summary_watcher import SummaryWatcher
            >>> summary_watcher = SummaryWatcher()
            >>> summaries = summary_watcher.is_summary_directory('/summary/base/dir', './job-01')
        """
        if self._contains_null_byte(summary_base_dir=summary_base_dir, relative_path=relative_path):
            return False

        if not self._is_valid_summary_directory(summary_base_dir, relative_path):
            return False

        summary_directory = os.path.realpath(os.path.join(summary_base_dir, relative_path))
        try:
            entries = os.scandir(summary_directory)
        except PermissionError:
            logger.error('Path of summary base directory is not accessible.')
            raise FileSystemPermissionError('Path of summary base directory is not accessible.')

        # ``with`` closes the directory handle on the early return below.
        with entries:
            for entry in entries:
                if entry.is_symlink() or not entry.is_file():
                    continue
                summary_pattern = re.search(self.SUMMARY_FILENAME_REGEX, entry.name)
                pb_pattern = re.search(self.PB_FILENAME_REGEX, entry.name)
                if summary_pattern or pb_pattern:
                    return True

        return False

    def list_summary_directories_by_pagination(self, summary_base_dir, offset=0, limit=10):
        """
        List summary directories within base directory.

        The directories are collected with overall=False, so the scan is bounded by
        MAX_SCAN_COUNT and the total may be incomplete for very large base directories.

        Args:
            summary_base_dir (str): Path of summary base directory.
            offset (int): An offset for page. Ex, offset is 0, mean current page is 1. Default value is 0.
            limit (int): The max data items for per page. Default value is 10.

        Returns:
            tuple[total, directories], total indicates the overall number of summary directories and directories
                indicate list of summary directory info including the following attributes.
                - relative_path (str): Relative path of summary directory, referring to summary_base_dir,
                                       starting with "./".
                - create_time (datetime): Creation time of summary file.
                - update_time (datetime): Modification time of summary file.

        Raises:
            ParamValueError, if offset < 0 or limit is out of valid value range.
            ParamTypeError, if offset or limit is not valid integer.
            (Both are expected to come from Validation, which is defined outside this module.)

        Examples:
            >>> from mindinsight.datavisual.data_transform.summary_watcher import SummaryWatcher
            >>> summary_watcher = SummaryWatcher()
            >>> total, directories = summary_watcher.list_summary_directories_by_pagination(
                        '/summary/base/dir', offset=0, limit=10)
        """
        offset = Validation.check_offset(offset=offset)
        limit = Validation.check_limit(limit, min_value=1, max_value=999)

        directories = self.list_summary_directories(summary_base_dir, overall=False)
        # offset is a page index, so page ``offset`` covers items [offset*limit, (offset+1)*limit)
        return len(directories), directories[offset * limit:(offset + 1) * limit]

    def list_summaries(self, summary_base_dir, relative_path='./'):
        """
        Get info of summary files within the given summary directory.

        Only regular (non-symlink) files matching SUMMARY_FILENAME_REGEX are reported; files
        whose embedded timestamp cannot be converted to a datetime are skipped.

        Args:
            summary_base_dir (str): Path of summary base directory.
            relative_path (str): Relative path of summary directory, referring to summary base directory,
                                 starting with "./" .

        Returns:
            list, list of summary file including the following attributes.
                - file_name (str): Summary file name.
                - create_time (datetime): Creation time of summary file.
                - update_time (datetime): Modification time of summary file.

        Raises:
            FileSystemPermissionError, if the summary directory is not accessible.

        Examples:
            >>> from mindinsight.datavisual.data_transform.summary_watcher import SummaryWatcher
            >>> summary_watcher = SummaryWatcher()
            >>> summaries = summary_watcher.list_summaries('/summary/base/dir', './job-01')
        """
        if self._contains_null_byte(summary_base_dir=summary_base_dir, relative_path=relative_path):
            return []

        if not self._is_valid_summary_directory(summary_base_dir, relative_path):
            return []

        summaries = []
        summary_directory = os.path.realpath(os.path.join(summary_base_dir, relative_path))
        try:
            entries = os.scandir(summary_directory)
        except PermissionError:
            logger.error('Path of summary directory is not accessible.')
            raise FileSystemPermissionError('Path of summary directory is not accessible.')

        with entries:
            for entry in entries:
                if entry.is_symlink() or not entry.is_file():
                    continue

                pattern = re.search(self.SUMMARY_FILENAME_REGEX, entry.name)
                if pattern is None:
                    continue

                timestamp = int(pattern.groupdict().get('timestamp'))
                try:
                    # extract created time from filename
                    ctime = datetime.datetime.fromtimestamp(timestamp).astimezone()
                except self._TIMESTAMP_ERRORS:
                    # the digits in the file name are not a representable timestamp
                    continue

                # extract modified time from filesystem
                mtime = datetime.datetime.fromtimestamp(entry.stat().st_mtime).astimezone()

                summaries.append({
                    'file_name': entry.name,
                    'create_time': ctime,
                    'update_time': mtime,
                })

        # sort by update time in descending order and filename in ascending order
        summaries.sort(key=lambda x: (-int(x['update_time'].timestamp()), x['file_name']))

        return summaries
|