You cannot select more than 25 topics. Topics must start with a Chinese character, a letter or a number; they can include dashes ('-') and can be up to 35 characters long.

minddata_cpu_utilization_analyser.py 12 kB

5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
5 years ago
123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238
  1. # Copyright 2021 Huawei Technologies Co., Ltd
  2. #
  3. # Licensed under the Apache License, Version 2.0 (the "License");
  4. # you may not use this file except in compliance with the License.
  5. # You may obtain a copy of the License at
  6. #
  7. # http://www.apache.org/licenses/LICENSE-2.0
  8. #
  9. # Unless required by applicable law or agreed to in writing, software
  10. # distributed under the License is distributed on an "AS IS" BASIS,
  11. # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  12. # See the License for the specific language governing permissions and
  13. # limitations under the License.
  14. # ============================================================================
  15. """The MindDataCpuUtilizationAnalyser analyser class."""
  16. import json
  17. import os
  18. from mindinsight.profiler.analyser.base_analyser import BaseAnalyser
  19. from mindinsight.profiler.common.exceptions.exceptions import ProfilerRawFileException, ProfilerFileNotFoundException
  20. from mindinsight.profiler.common.log import logger as log
  21. from mindinsight.profiler.common.validator.validate_path import validate_and_normalize_path
  22. from mindinsight.profiler.analyser.minddata_analyser import MinddataAnalyser
  23. class MinddataCpuUtilizationAnalyser(BaseAnalyser):
  24. """The analyser for analyzing minddata cpu utilization."""
  25. _cpu_utilization_display_filename = "minddata_cpu_utilization_{}.json"
  26. _minddata_pipeline_display_filename = "pipeline_profiling_{}.json"
  27. def __init__(self, profiling_dir, device_id):
  28. super().__init__(profiling_dir, device_id)
  29. self._steps_info = self._get_minddata_cpu_utilization_steps_info()
  30. self._cpu_utilization_info = dict()
  31. def get_idle_utilization_avg(self):
  32. """Get the idle utilization information of the whole machine."""
  33. filter_condition = {}
  34. self._filter(filter_condition)
  35. device_key_value = "device_info"
  36. self._get_cpu_utilization_average_value(device_key_value)
  37. idle_utilization_avg = self._cpu_utilization_info.get("device_info").get("idle_utilization").get("avg_value")
  38. return idle_utilization_avg
  39. def query(self, condition=None):
  40. """
  41. Query data according to the condition.
  42. Args:
  43. condition (dict): The search condition, only contains `filter_condition` parameter.
  44. Default: None.
  45. Returns:
  46. dict, the result after filtered, sorted and grouped.
  47. """
  48. if condition is None:
  49. condition = {}
  50. filter_condition = condition.get('filter_condition', {})
  51. log.info("Receive query request. %s", filter_condition)
  52. self._filter(filter_condition)
  53. self._cpu_utilization_info["sampling_interval"] = self._data.get("sampling_interval")
  54. self._cpu_utilization_info["step_info"] = self._steps_info
  55. self._cpu_utilization_info["step_total_num"] = self._step_total_num
  56. self._cpu_utilization_info["cpu_processor_num"] = self._data.get("cpu_processor_num")
  57. # device average CPU utilization
  58. device_key_value = "device_info"
  59. self._get_cpu_utilization_average_value(device_key_value)
  60. # process average CPU utilization
  61. process_key_value = "process_info"
  62. self._get_cpu_utilization_average_value(process_key_value)
  63. # op average CPU utilization
  64. self._get_cpu_utilization_op_average_value()
  65. return self._cpu_utilization_info
  66. def _load(self):
  67. """Load cpu_utilization info."""
  68. file_name = self._cpu_utilization_display_filename.format(self._device_id)
  69. file_path = os.path.join(self._profiling_dir, file_name)
  70. file_path = validate_and_normalize_path(
  71. file_path, raise_key="Invalid cpu_utilization_info file path.")
  72. if not os.path.exists(file_path):
  73. log.error('Did not find the cpu utilization file: %s', file_path)
  74. raise ProfilerFileNotFoundException(msg='Did not find the cpu utilization file.')
  75. with open(file_path, 'r', encoding='utf-8') as file:
  76. try:
  77. self._data = json.load(file)
  78. except json.JSONDecodeError as err:
  79. log.exception(err)
  80. raise ProfilerRawFileException("Fail to parse cpu_utilization info file")
    def _filter(self, filter_condition):
        """
        Filter the profiling data according to the filter condition.

        Trims `self._steps_info` and every metrics series in
        `self._data` ("device_info", "process_info", "op_info") in place,
        keeping only the sampling points that fall inside the selected
        step range.

        Args:
            filter_condition (dict): The filter condition.

                - start_step (int): The selected start step id. Default: 1.
                - end_step (int): The selected end step id.
                  Default: total step number.
        """
        start_step = filter_condition.get("start_step", 1)
        end_step = filter_condition.get("end_step", self._step_total_num)
        # Steps with no sampling point leave no entry in `_steps_info`:
        # move the start step forward (and the end step backward) until a
        # step that actually occurs is found.
        # NOTE(review): if `start_step` were larger than every recorded
        # step this loop would not terminate — presumably the caller always
        # passes a valid range; confirm at the API boundary.
        while not self._steps_info.count(str(start_step)):
            start_step += 1
        left_index = self._steps_info.index(str(start_step))
        while not self._steps_info.count(str(end_step)):
            end_step -= 1
        # Right boundary is the LAST sampling point of `end_step`
        # (first occurrence index + occurrence count - 1).
        right_index = self._steps_info.index(str(end_step)) + \
            self._steps_info.count(str(end_step)) - 1
        self._steps_info = self._steps_info[left_index:right_index + 1]
        # filter device CPU utilization
        for key in self._data.get("device_info").keys():
            self._data["device_info"][key] = \
                self._data.get("device_info").get(key)[left_index:right_index + 1]
        # filter process CPU utilization
        for key in self._data.get("process_info").keys():
            self._data["process_info"][key] = self._data.get("process_info").get(key)[left_index:right_index + 1]
        # filter op CPU utilization
        for item in self._data.get("op_info"):
            for key in item.get("metrics").keys():
                item["metrics"][key] = item.get("metrics").get(key)[left_index:right_index + 1]
    def _get_minddata_cpu_utilization_steps_info(self):
        """Establish a connection between cpu utilization sampling points and host queue capacity.

        Returns:
            list, one entry per CPU-utilization sampling timestamp, holding
            the step number the sampling point is attributed to.

        NOTE(review): points before the first step boundary are recorded as
        the int ``0`` while all other entries are the string step ids read
        from the device queue file — `_filter` compares with ``str(...)``,
        so a mixed-type list is intentional only for step 0; confirm.
        """
        steps_info = []
        left_index = 0
        right_index = 0
        time_stamp = self._data.get("time_stamp")
        queue_step_time_info = self._get_minddata_queue_step_time_info()
        self._step_total_num = len(queue_step_time_info)
        step0 = 0
        for item in time_stamp:
            # queue_step_time_info[][0]:step_num
            # queue_step_time_info[][1]:sample time
            # points less than step1 are classified as step0
            if float(item) < float(queue_step_time_info[0][1]):
                steps_info.append(step0)
                continue
            # Resume the boundary walk where the previous timestamp stopped
            # (timestamps are assumed to be ascending).
            while right_index < len(queue_step_time_info):
                if float(item) <= float(queue_step_time_info[right_index][1]):
                    if float(item) < float(queue_step_time_info[right_index][1]):
                        # Strictly inside the interval: belongs to the
                        # previous (left) step.
                        steps_info.append(queue_step_time_info[left_index][0])
                    else:
                        # Exactly on a boundary: belongs to that step.
                        steps_info.append(queue_step_time_info[right_index][0])
                    break
                left_index = right_index
                right_index += 1
            # Timestamps beyond the last recorded boundary are attributed
            # to the final step.
            if right_index == len(queue_step_time_info):
                steps_info.append(queue_step_time_info[right_index - 1][0])
        return steps_info
  138. def _get_minddata_queue_step_time_info(self):
  139. """Get the sampling time information at the steps of the host queue"""
  140. minddata_queue_step_time_info = []
  141. minddata_analyser = MinddataAnalyser(self._profiling_dir, self._device_id)
  142. file_path = minddata_analyser.get_device_queue_file_path()
  143. file_path = validate_and_normalize_path(
  144. file_path, raise_key="Invalid device_queue file path")
  145. if not os.path.exists(file_path):
  146. log.error('Did not find the device queue file: %s', file_path)
  147. raise ProfilerFileNotFoundException(msg='Did not find the device queue file.')
  148. with open(file_path) as data_file:
  149. for line in data_file.readlines():
  150. op_info = line.split()
  151. # op_info is a list like:['1','64','8','2','85406783']
  152. # The value of the first element in op_info is '0' or '1'.
  153. # '0' means that the time information is recorded.
  154. # '1' means that the queue information is recorded.
  155. # '1':queue info , '64':queue capacity, '8':step_num, '2':queue size, '85406783':sampling time.
  156. if op_info and op_info[0] == "1":
  157. minddata_queue_step_time_info.append([op_info[2], op_info[4]])
  158. return minddata_queue_step_time_info
  159. def _get_minddata_pipeline_info(self):
  160. """Get the number of thread cores in minddata pipeline operator"""
  161. file_name = self._minddata_pipeline_display_filename.format(self._device_id)
  162. file_path = os.path.join(self._profiling_dir, file_name)
  163. file_path = validate_and_normalize_path(
  164. file_path, raise_key="Invalid minddata_pipeline_info file path.")
  165. if not os.path.exists(file_path):
  166. log.error('Did not find the minddata_pipeline file: %s', file_path)
  167. raise ProfilerFileNotFoundException(msg='Did not find the minddata_pipeline file:{}'.format(file_path))
  168. with open(file_path, 'r', encoding='utf-8') as file:
  169. try:
  170. minddata_pipeline_info = json.load(file)
  171. except json.JSONDecodeError as err:
  172. log.exception(err)
  173. raise ProfilerRawFileException("Fail to parse minddata pipeline file")
  174. minddata_pipeline_op_info = []
  175. for item in minddata_pipeline_info.get("op_info"):
  176. op_info_dict = dict()
  177. op_info_dict["op_id"] = item.get("op_id")
  178. op_info_dict["num_workers"] = item.get("num_workers")
  179. minddata_pipeline_op_info.append(op_info_dict)
  180. return minddata_pipeline_op_info
  181. def _get_cpu_utilization_average_value(self, key_value):
  182. """Get cpu_utilization average value for host or process."""
  183. self._cpu_utilization_info[key_value] = dict()
  184. for key in self._data.get(key_value).keys():
  185. arr = self._data.get(key_value)[key]
  186. avg_value = round(sum(arr) / len(arr)) if arr else 0
  187. self._cpu_utilization_info[key_value][key] = {"metrics": arr, "avg_value": avg_value}
  188. def _get_cpu_utilization_op_average_value(self):
  189. """Get cpu_utilization average value for op."""
  190. minddata_pipeline_op_info = self._get_minddata_pipeline_info()
  191. self._cpu_utilization_info["op_info"] = {
  192. "op_list": [],
  193. "total_op_avg_value": {"user_utilization": 0, "sys_utilization": 0}
  194. }
  195. for item in self._data.get("op_info"):
  196. # Filtering out non minddata pipeline operator
  197. if str(item.get("op_id")) == "-1":
  198. continue
  199. op_info_dict = dict()
  200. op_info_dict["metrics"] = dict()
  201. for key in item.get("metrics").keys():
  202. arr = item.get("metrics")[key]
  203. avg_value = round(sum(arr) / len(arr)) if arr else 0
  204. op_info_dict["metrics"][key] = {"metrics": arr, "avg_value": avg_value}
  205. self._cpu_utilization_info["op_info"]["total_op_avg_value"][key] += avg_value
  206. op_info_dict["op_id"] = item.get("op_id")
  207. op_info = [i for i in minddata_pipeline_op_info if i.get("op_id") == item.get("op_id")]
  208. # op_info is like [{"num_workers":int,"op_id":int}]
  209. op_info_dict["num_workers"] = op_info[0].get("num_workers")
  210. self._cpu_utilization_info["op_info"]["op_list"].append(op_info_dict)