You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number, can include dashes ('-') and can be up to 35 characters long.

explain_manager.py 11 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264
  1. # Copyright 2020 Huawei Technologies Co., Ltd
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. # ============================================================================
  15. """ExplainManager."""
  16. import os
  17. import threading
  18. import time
  19. from collections import OrderedDict
  20. from datetime import datetime
  21. from typing import Optional
  22. from mindinsight.conf import settings
  23. from mindinsight.datavisual.common import exceptions
  24. from mindinsight.datavisual.common.enums import BaseEnum
  25. from mindinsight.explainer.common.log import logger
  26. from mindinsight.explainer.manager.explain_loader import ExplainLoader
  27. from mindinsight.datavisual.data_access.file_handler import FileHandler
  28. from mindinsight.datavisual.data_transform.summary_watcher import SummaryWatcher
  29. from mindinsight.utils.exceptions import MindInsightException, ParamValueError, UnknownError
# Maximum number of ExplainLoader instances kept in the cache pool at any time.
_MAX_LOADERS_NUM = 3
class _ExplainManagerStatus(BaseEnum):
    """Life-cycle status of the ExplainManager data loading."""
    INIT = 'INIT'        # manager created, no load pass has completed yet
    LOADING = 'LOADING'  # a load pass is currently running
    DONE = 'DONE'        # last load pass finished with at least one loader cached
    INVALID = 'INVALID'  # last load pass failed, or no summaries were found
  37. class ExplainManager:
  38. """ExplainManager."""
  39. def __init__(self, summary_base_dir: str):
  40. self._summary_base_dir = summary_base_dir
  41. self._loader_pool = OrderedDict()
  42. self._loading_status = _ExplainManagerStatus.INIT.value
  43. self._status_mutex = threading.Lock()
  44. self._loader_pool_mutex = threading.Lock()
  45. self._max_loaders_num = _MAX_LOADERS_NUM
  46. self._summary_watcher = SummaryWatcher()
  47. @property
  48. def summary_base_dir(self):
  49. """Return the base directory for summary records."""
  50. return self._summary_base_dir
  51. def start_load_data(self, reload_interval: int = 0):
  52. """
  53. Start individual thread to cache explain_jobs and loading summary data periodically.
  54. Args:
  55. reload_interval (int): Specify the loading period in seconds. If interval == 0, data will only be loaded
  56. once. Default: 0.
  57. """
  58. thread = threading.Thread(target=self._repeat_loading,
  59. name='start_load_thread',
  60. args=(reload_interval,),
  61. daemon=True)
  62. time.sleep(1)
  63. thread.start()
  64. def get_job(self, loader_id: str) -> Optional[ExplainLoader]:
  65. """
  66. Return ExplainLoader given loader_id.
  67. If explain job w.r.t given loader_id is not found, None will be returned.
  68. Args:
  69. loader_id (str): The id of expected ExplainLoader
  70. Return:
  71. explain_job
  72. """
  73. self._check_status_valid()
  74. with self._loader_pool_mutex:
  75. if loader_id in self._loader_pool:
  76. self._loader_pool[loader_id].query_time = datetime.now().timestamp()
  77. self._loader_pool.move_to_end(loader_id, last=False)
  78. return self._loader_pool[loader_id]
  79. try:
  80. loader = self._generate_loader_from_relative_path(loader_id)
  81. loader.query_time = datetime.now().timestamp()
  82. self._add_loader(loader)
  83. self._reload_data_again()
  84. except ParamValueError:
  85. logger.warning('Cannot find summary in path: %s. No explain_job will be returned.', loader_id)
  86. return None
  87. return loader
  88. def get_job_list(self, offset=0, limit=None):
  89. """
  90. Return List of explain jobs. includes job ID, create and update time.
  91. Args:
  92. offset (int): An offset for page. Ex, offset is 0, mean current page is 1. Default value is 0.
  93. limit (int): The max data items for per page. Default value is 10.
  94. Returns:
  95. tuple[total, directories], total indicates the overall number of explain directories and directories
  96. indicate list of summary directory info including the following attributes.
  97. - relative_path (str): Relative path of summary directory, referring to settings.SUMMARY_BASE_DIR,
  98. starting with "./".
  99. - create_time (datetime): Creation time of summary file.
  100. - update_time (datetime): Modification time of summary file.
  101. """
  102. total, dir_infos = \
  103. self._summary_watcher.list_explain_directories(self._summary_base_dir, offset=offset, limit=limit)
  104. return total, dir_infos
  105. def _repeat_loading(self, repeat_interval):
  106. """Periodically loading summary."""
  107. while True:
  108. try:
  109. logger.info('Start to load data, repeat interval: %r.', repeat_interval)
  110. self._load_data()
  111. if not repeat_interval:
  112. return
  113. time.sleep(repeat_interval)
  114. except UnknownError as ex:
  115. logger.exception(ex)
  116. logger.error('Unexpected error happens when loading data. Loading status: %s, loading pool size: %d'
  117. 'Detail: %s', self._loading_status, len(self._loader_pool), str(ex))
  118. def _load_data(self):
  119. """
  120. Prepare loaders in cache and start loading the data from summaries.
  121. Only a limited number of loaders will be cached in terms of updated_time or query_time. The size of cache
  122. pool is determined by _MAX_LOADERS_NUM. When the manager start loading data, only the lastest _MAX_LOADER_NUM
  123. summaries will be loaded in cache. If a cached loader if queries by 'get_job', the query_time of the loader
  124. will be updated as well as the the loader moved to the end of cache. If an uncached summary is queried,
  125. a new loader instance will be generated and put to the end cache.
  126. """
  127. try:
  128. with self._status_mutex:
  129. if self._loading_status == _ExplainManagerStatus.LOADING.value:
  130. logger.info('Current status is %s, will ignore to load data.', self._loading_status)
  131. return
  132. self._loading_status = _ExplainManagerStatus.LOADING.value
  133. self._cache_loaders()
  134. self._execute_loading()
  135. if not self._loader_pool:
  136. self._loading_status = _ExplainManagerStatus.INVALID.value
  137. else:
  138. self._loading_status = _ExplainManagerStatus.DONE.value
  139. logger.info('Load event data end, status: %s, and loader pool size: %d',
  140. self._loading_status, len(self._loader_pool))
  141. except Exception as ex:
  142. self._loading_status = _ExplainManagerStatus.INVALID.value
  143. logger.exception(ex)
  144. raise UnknownError(str(ex))
  145. def _cache_loaders(self):
  146. """Cache explain loader in cache pool."""
  147. dir_map_mtime_dict = []
  148. _, summaries_info = self._summary_watcher.list_explain_directories(self._summary_base_dir)
  149. for summary_info in summaries_info:
  150. summary_path = summary_info.get('relative_path')
  151. summary_update_time = summary_info.get('update_time').timestamp()
  152. if summary_path in self._loader_pool:
  153. summary_update_time = max(summary_update_time, self._loader_pool[summary_path].query_time)
  154. dir_map_mtime_dict.append((summary_info, summary_update_time))
  155. sorted_summaries_info = sorted(dir_map_mtime_dict, key=lambda x: x[1])[-_MAX_LOADERS_NUM:]
  156. with self._loader_pool_mutex:
  157. for summary_info, query_time in sorted_summaries_info:
  158. summary_path = summary_info['relative_path']
  159. if summary_path not in self._loader_pool:
  160. loader = self._generate_loader_from_relative_path(summary_path)
  161. self._add_loader(loader)
  162. else:
  163. self._loader_pool[summary_path].query_time = query_time
  164. self._loader_pool.move_to_end(summary_path, last=False)
  165. def _generate_loader_from_relative_path(self, relative_path: str) -> ExplainLoader:
  166. """Generate explain loader from the given relative path."""
  167. self._check_summary_exist(relative_path)
  168. current_dir = os.path.realpath(FileHandler.join(self._summary_base_dir, relative_path))
  169. loader_id = self._generate_loader_id(relative_path)
  170. loader = ExplainLoader(loader_id=loader_id, summary_dir=current_dir)
  171. return loader
  172. def _add_loader(self, loader):
  173. """add loader to the loader_pool."""
  174. if loader.train_id not in self._loader_pool:
  175. self._loader_pool[loader.train_id] = loader
  176. else:
  177. self._loader_pool.move_to_end(loader.train_id)
  178. while len(self._loader_pool) > self._max_loaders_num:
  179. self._loader_pool.popitem(last=False)
  180. def _execute_loading(self):
  181. """Execute the data loading."""
  182. for loader_id in list(self._loader_pool.keys()):
  183. try:
  184. with self._loader_pool_mutex:
  185. loader = self._loader_pool.get(loader_id, None)
  186. if loader is None:
  187. logger.debug('Loader %r has been deleted, will not load data', loader_id)
  188. return
  189. loader.load()
  190. except MindInsightException as ex:
  191. logger.warning('Data loader %r load data failed. Delete data_loader. Detail: %s', loader_id, ex)
  192. with self._loader_pool_mutex:
  193. self._delete_loader(loader_id)
  194. def _delete_loader(self, loader_id):
  195. """delete loader given loader_id"""
  196. if loader_id in self._loader_pool:
  197. self._loader_pool.pop(loader_id)
  198. logger.debug('delete loader %s', loader_id)
  199. def _check_status_valid(self):
  200. """Check manager status."""
  201. if self._loading_status == _ExplainManagerStatus.INIT.value:
  202. raise exceptions.SummaryLogIsLoading('Data is loading, current status is %s' % self._loading_status)
  203. def _check_summary_exist(self, loader_id):
  204. """Verify thee train_job is existed given loader_id."""
  205. if not self._summary_watcher.is_summary_directory(self._summary_base_dir, loader_id):
  206. raise ParamValueError('Can not find the train job in the manager.')
  207. def _reload_data_again(self):
  208. """Reload the data one more time."""
  209. logger.debug('Start to reload data again.')
  210. thread = threading.Thread(target=self._load_data, name='reload_data_thread')
  211. thread.daemon = False
  212. thread.start()
  213. @staticmethod
  214. def _generate_loader_id(relative_path):
  215. """Generate loader id for given path"""
  216. loader_id = relative_path
  217. return loader_id
# Module-level singleton instance, shared by importers of this module.
EXPLAIN_MANAGER = ExplainManager(summary_base_dir=settings.SUMMARY_BASE_DIR)