You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number; they can include dashes ('-') and can be up to 35 characters long.

computing_resource_mgr.py 9.7 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279
  1. # Copyright 2020-2021 Huawei Technologies Co., Ltd
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. # ============================================================================
  15. """Compute resource manager."""
  16. import fractions
  17. import math
  18. import threading
  19. from concurrent import futures
  20. import multiprocessing
  21. from mindinsight.utils.log import setup_logger
  22. from mindinsight.utils.constant import GeneralErrors
  23. from mindinsight.utils.exceptions import MindInsightException
  24. _MP_CONTEXT = multiprocessing.get_context(method="forkserver")
  25. terminating = False
  26. class ComputingResourceManager:
  27. """
  28. Manager for computing resources.
  29. This class provides executors for computing tasks. Executors can only be used once.
  30. Args:
  31. executors_cnt (int): Number of executors to be provided by this class.
  32. max_processes_cnt (int): Max number of processes to be used for computing.
  33. """
  34. def __init__(self, executors_cnt=1, max_processes_cnt=4):
  35. self._max_processes_cnt = max_processes_cnt
  36. self._executors_cnt = executors_cnt
  37. self._lock = threading.Lock()
  38. self._executors = {
  39. ind: Executor(
  40. self, executor_id=ind,
  41. available_workers=fractions.Fraction(self._max_processes_cnt, self._executors_cnt))
  42. for ind in range(self._executors_cnt)
  43. }
  44. self._remaining_executors = len(self._executors)
  45. self._backend = futures.ProcessPoolExecutor(max_workers=max_processes_cnt, mp_context=_MP_CONTEXT)
  46. self.logger = setup_logger("utils", "utils")
  47. self.logger.info("Initialized ComputingResourceManager with executors_cnt=%s, max_processes_cnt=%s.",
  48. executors_cnt, max_processes_cnt)
  49. def __enter__(self):
  50. """This method is not thread safe."""
  51. return self
  52. def __exit__(self, exc_type, exc_val, exc_tb):
  53. """
  54. This should not block because every executor have waited. If it blocks, there may be some problem.
  55. This method is not thread safe.
  56. """
  57. self._backend.shutdown()
  58. def get_executor(self):
  59. """
  60. Get an executor.
  61. Returns:
  62. Executor, which can be used for submitting tasks.
  63. Raises:
  64. ComputeResourceManagerException: when no more executor is available.
  65. """
  66. with self._lock:
  67. self._remaining_executors -= 1
  68. if self._remaining_executors < 0:
  69. raise ComputingResourceManagerException("No more executors.")
  70. return self._executors[self._remaining_executors]
  71. def destroy_executor(self, executor_id):
  72. """
  73. Destroy an executor to reuse it's workers.
  74. Args:
  75. executor_id (int): Id of the executor to be destroyed.
  76. """
  77. with self._lock:
  78. released_workers = self._executors[executor_id].available_workers
  79. self._executors.pop(executor_id)
  80. remaining_executors = len(self._executors)
  81. self.logger.info("Destroy executor %s. Will release %s worker(s). Remaining executors: %s.",
  82. executor_id, released_workers, remaining_executors)
  83. if not remaining_executors:
  84. return
  85. for executor in self._executors.values():
  86. executor.add_worker(
  87. fractions.Fraction(
  88. released_workers.numerator,
  89. released_workers.denominator * remaining_executors))
  90. def submit(self, *args, **kwargs):
  91. """
  92. Submit a task.
  93. See concurrent.futures.Executor.submit() for details.
  94. This method should only be called by Executor. Users should not call this method directly.
  95. """
  96. with self._lock:
  97. if not terminating:
  98. return self._backend.submit(*args, **kwargs)
  99. self.logger.info('Got submit after process pool shutdown.')
  100. return None
  101. class ComputingResourceManagerException(MindInsightException):
  102. """
  103. Indicates a computing resource error has occurred.
  104. This exception should not be presented to end users.
  105. Args:
  106. msg (str): Exception message.
  107. """
  108. def __init__(self, msg):
  109. super().__init__(error=GeneralErrors.COMPUTING_RESOURCE_ERROR, message=msg)
  110. class WrappedFuture:
  111. """
  112. Wrap Future objects with custom logics to release compute slots.
  113. Args:
  114. executor (Executor): The executor which generates this future.
  115. original_future (futures.Future): Original future object.
  116. """
  117. def __init__(self, executor, original_future: futures.Future):
  118. self._original_future = original_future
  119. self._executor = executor
  120. self.logger = setup_logger("utils", "utils")
  121. def add_done_callback(self, callback):
  122. """
  123. Add done callback.
  124. See futures.Future.add_done_callback() for details.
  125. """
  126. def _wrapped_callback(*args, **kwargs):
  127. self.logger.debug("Future callback called.")
  128. try:
  129. return callback(*args, **kwargs)
  130. finally:
  131. self._executor.release_slot()
  132. self._executor.remove_done_future(self._original_future)
  133. self._original_future.add_done_callback(_wrapped_callback)
  134. class Executor:
  135. """
  136. Task executor.
  137. Args:
  138. mgr (ComputingResourceManager): The ComputingResourceManager that generates this executor.
  139. executor_id (int): Executor id.
  140. available_workers (fractions.Fraction): Available workers.
  141. """
  142. def __init__(self, mgr: ComputingResourceManager, executor_id, available_workers):
  143. self._mgr = mgr
  144. self.closed = False
  145. self._available_workers = available_workers
  146. self._effective_workers = self._calc_effective_workers(self._available_workers)
  147. self._slots = threading.Semaphore(value=self._effective_workers)
  148. self._id = executor_id
  149. self._futures = set()
  150. self._lock = threading.Lock()
  151. self.logger = setup_logger("utils", "utils")
  152. self.logger.debug("Available workers: %s.", available_workers)
  153. def __enter__(self):
  154. """This method is not thread safe."""
  155. if self.closed:
  156. raise ComputingResourceManagerException("Can not reopen closed executor.")
  157. return self
  158. def __exit__(self, exc_type, exc_val, exc_tb):
  159. """This method is not thread safe."""
  160. self._close()
  161. def submit(self, *args, **kwargs):
  162. """
  163. Submit task.
  164. See concurrent.futures.Executor.submit() for details. This method is not thread safe.
  165. """
  166. self.logger.debug("Task submitted to executor %s.", self._id)
  167. if self.closed:
  168. raise ComputingResourceManagerException("Cannot submit task to a closed executor.")
  169. # Thread will wait on acquire().
  170. self._slots.acquire()
  171. future = self._mgr.submit(*args, **kwargs)
  172. if future is None:
  173. return None
  174. # set.add is atomic in c-python.
  175. self._futures.add(future)
  176. return WrappedFuture(self, future)
  177. def release_slot(self):
  178. """
  179. Release a slot for new tasks to be submitted.
  180. Semaphore is itself thread safe, so no lock is needed.
  181. This method should only be called by ExecutorFuture.
  182. """
  183. self._slots.release()
  184. def remove_done_future(self, future):
  185. """
  186. Remove done futures so the executor will not track them.
  187. This method should only be called by WrappedFuture.
  188. """
  189. # set.remove is atomic in c-python so no lock is needed.
  190. self._futures.remove(future)
  191. @staticmethod
  192. def _calc_effective_workers(available_workers):
  193. return 1 if available_workers <= 1 else math.floor(available_workers)
  194. def _close(self):
  195. self.closed = True
  196. self.logger.debug("Executor is being closed, futures to wait: %s", self._futures)
  197. futures.wait(self._futures)
  198. self.logger.debug("Executor wait futures completed.")
  199. self._mgr.destroy_executor(self._id)
  200. self.logger.debug("Executor is closed.")
  201. @property
  202. def available_workers(self):
  203. """Get available workers."""
  204. with self._lock:
  205. return self._available_workers
  206. def add_worker(self, added_available_workers):
  207. """This method should only be called by ComputeResourceManager."""
  208. self.logger.debug("Add worker: %s", added_available_workers)
  209. with self._lock:
  210. self._available_workers += added_available_workers
  211. new_effective_workers = self._calc_effective_workers(self._available_workers)
  212. if new_effective_workers > self._effective_workers:
  213. for _ in range(new_effective_workers - self._effective_workers):
  214. self._slots.release()
  215. self._effective_workers = new_effective_workers
  216. def wait_all_tasks_finish(self):
  217. """
  218. Wait all tasks finish.
  219. This method is not thread safe.
  220. """
  221. futures.wait(self._futures)
  222. def terminate():
  223. """Set the terminating flag."""
  224. global terminating
  225. terminating = True