You cannot select more than 25 topics. Topics must start with a Chinese character, a letter, or a number, can include dashes ('-'), and can be up to 35 characters long.

computing_resource_mgr.py 9.1 kB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261
  1. # Copyright 2020 Huawei Technologies Co., Ltd
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. # ============================================================================
  15. """Compute resource manager."""
  16. import fractions
  17. import math
  18. import threading
  19. from concurrent import futures
  20. from mindinsight.utils.log import utils_logger as logger
  21. from mindinsight.utils.constant import GeneralErrors
  22. from mindinsight.utils.exceptions import MindInsightException
  23. class ComputingResourceManager:
  24. """
  25. Manager for computing resources.
  26. This class provides executors for computing tasks. Executors can only be used once.
  27. Args:
  28. executors_cnt (int): Number of executors to be provided by this class.
  29. max_processes_cnt (int): Max number of processes to be used for computing.
  30. """
  31. def __init__(self, executors_cnt, max_processes_cnt):
  32. self._max_processes_cnt = max_processes_cnt
  33. self._executors_cnt = executors_cnt
  34. self._lock = threading.Lock()
  35. self._executors = {
  36. ind: Executor(
  37. self, executor_id=ind,
  38. available_workers=fractions.Fraction(self._max_processes_cnt, self._executors_cnt))
  39. for ind in range(self._executors_cnt)
  40. }
  41. self._remaining_executors = len(self._executors)
  42. self._backend = futures.ProcessPoolExecutor(max_workers=max_processes_cnt)
  43. logger.info("Initialized ComputingResourceManager with executors_cnt=%s, max_processes_cnt=%s.",
  44. executors_cnt, max_processes_cnt)
  45. def __enter__(self):
  46. """This method is not thread safe."""
  47. return self
  48. def __exit__(self, exc_type, exc_val, exc_tb):
  49. """
  50. This should not block because every executor have waited. If it blocks, there may be some problem.
  51. This method is not thread safe.
  52. """
  53. self._backend.shutdown()
  54. def get_executor(self):
  55. """
  56. Get an executor.
  57. Returns:
  58. Executor, which can be used for submitting tasks.
  59. Raises:
  60. ComputeResourceManagerException: when no more executor is available.
  61. """
  62. with self._lock:
  63. self._remaining_executors -= 1
  64. if self._remaining_executors < 0:
  65. raise ComputingResourceManagerException("No more executors.")
  66. return self._executors[self._remaining_executors]
  67. def destroy_executor(self, executor_id):
  68. """
  69. Destroy an executor to reuse it's workers.
  70. Args:
  71. executor_id (int): Id of the executor to be destroyed.
  72. """
  73. with self._lock:
  74. released_workers = self._executors[executor_id].available_workers
  75. self._executors.pop(executor_id)
  76. remaining_executors = len(self._executors)
  77. logger.info("Destroy executor %s. Will release %s worker(s). Remaining executors: %s.",
  78. executor_id, released_workers, remaining_executors)
  79. if not remaining_executors:
  80. return
  81. for executor in self._executors.values():
  82. executor.add_worker(
  83. fractions.Fraction(
  84. released_workers.numerator,
  85. released_workers.denominator * remaining_executors))
  86. def submit(self, *args, **kwargs):
  87. """
  88. Submit a task.
  89. See concurrent.futures.Executor.submit() for details.
  90. This method should only be called by Executor. Users should not call this method directly.
  91. """
  92. with self._lock:
  93. return self._backend.submit(*args, **kwargs)
  94. class ComputingResourceManagerException(MindInsightException):
  95. """
  96. Indicates a computing resource error has occurred.
  97. This exception should not be presented to end users.
  98. Args:
  99. msg (str): Exception message.
  100. """
  101. def __init__(self, msg):
  102. super().__init__(error=GeneralErrors.COMPUTING_RESOURCE_ERROR, message=msg)
  103. class WrappedFuture:
  104. """
  105. Wrap Future objects with custom logics to release compute slots.
  106. Args:
  107. executor (Executor): The executor which generates this future.
  108. original_future (futures.Future): Original future object.
  109. """
  110. def __init__(self, executor, original_future: futures.Future):
  111. self._original_future = original_future
  112. self._executor = executor
  113. def add_done_callback(self, callback):
  114. """
  115. Add done callback.
  116. See futures.Future.add_done_callback() for details.
  117. """
  118. def _wrapped_callback(*args, **kwargs):
  119. logger.debug("Future callback called.")
  120. try:
  121. return callback(*args, **kwargs)
  122. finally:
  123. self._executor.release_slot()
  124. self._executor.remove_done_future(self._original_future)
  125. self._original_future.add_done_callback(_wrapped_callback)
  126. class Executor:
  127. """
  128. Task executor.
  129. Args:
  130. mgr (ComputingResourceManager): The ComputingResourceManager that generates this executor.
  131. executor_id (int): Executor id.
  132. available_workers (fractions.Fraction): Available workers.
  133. """
  134. def __init__(self, mgr: ComputingResourceManager, executor_id, available_workers):
  135. self._mgr = mgr
  136. self.closed = False
  137. self._available_workers = available_workers
  138. self._effective_workers = self._calc_effective_workers(self._available_workers)
  139. self._slots = threading.Semaphore(value=self._effective_workers)
  140. self._id = executor_id
  141. self._futures = set()
  142. self._lock = threading.Lock()
  143. logger.debug("Available workers: %s.", available_workers)
  144. def __enter__(self):
  145. """This method is not thread safe."""
  146. if self.closed:
  147. raise ComputingResourceManagerException("Can not reopen closed executor.")
  148. return self
  149. def __exit__(self, exc_type, exc_val, exc_tb):
  150. """This method is not thread safe."""
  151. self._close()
  152. def submit(self, *args, **kwargs):
  153. """
  154. Submit task.
  155. See concurrent.futures.Executor.submit() for details. This method is not thread safe.
  156. """
  157. logger.debug("Task submitted to executor %s.", self._id)
  158. if self.closed:
  159. raise ComputingResourceManagerException("Cannot submit task to a closed executor.")
  160. # Thread will wait on acquire().
  161. self._slots.acquire()
  162. future = self._mgr.submit(*args, **kwargs)
  163. # set.add is atomic in c-python.
  164. self._futures.add(future)
  165. return WrappedFuture(self, future)
  166. def release_slot(self):
  167. """
  168. Release a slot for new tasks to be submitted.
  169. Semaphore is itself thread safe, so no lock is needed.
  170. This method should only be called by ExecutorFuture.
  171. """
  172. self._slots.release()
  173. def remove_done_future(self, future):
  174. """
  175. Remove done futures so the executor will not track them.
  176. This method should only be called by WrappedFuture.
  177. """
  178. # set.remove is atomic in c-python so no lock is needed.
  179. self._futures.remove(future)
  180. @staticmethod
  181. def _calc_effective_workers(available_workers):
  182. return 1 if available_workers <= 1 else math.floor(available_workers)
  183. def _close(self):
  184. self.closed = True
  185. logger.debug("Executor is being closed, futures to wait: %s", self._futures)
  186. futures.wait(self._futures)
  187. logger.debug("Executor wait futures completed.")
  188. self._mgr.destroy_executor(self._id)
  189. logger.debug("Executor is closed.")
  190. @property
  191. def available_workers(self):
  192. """Get available workers."""
  193. with self._lock:
  194. return self._available_workers
  195. def add_worker(self, added_available_workers):
  196. """This method should only be called by ComputeResourceManager."""
  197. logger.debug("Add worker: %s", added_available_workers)
  198. with self._lock:
  199. self._available_workers += added_available_workers
  200. new_effective_workers = self._calc_effective_workers(self._available_workers)
  201. if new_effective_workers > self._effective_workers:
  202. for _ in range(new_effective_workers - self._effective_workers):
  203. self._slots.release()
  204. self._effective_workers = new_effective_workers
  205. def wait_all_tasks_finish(self):
  206. """
  207. Wait all tasks finish.
  208. This method is not thread safe.
  209. """
  210. futures.wait(self._futures)