Browse Source

!1238 Fix minddata_cpu_utilization bugs

From: @zyhstack
Reviewed-by: 
Signed-off-by:
tags/v1.2.0-rc1
mindspore-ci-bot Gitee 4 years ago
parent
commit
a9aed71854
3 changed files with 87 additions and 61 deletions
  1. +6
    -2
      mindinsight/datavisual/data_transform/summary_watcher.py
  2. +68
    -50
      mindinsight/profiler/analyser/minddata_cpu_utilization_analyser.py
  3. +13
    -9
      mindinsight/profiler/proposer/allproposers/minddata_proposer.py

+ 6
- 2
mindinsight/datavisual/data_transform/summary_watcher.py View File

@@ -85,6 +85,9 @@ class SummaryWatcher:
logger.error('Path of summary base directory is not accessible.') logger.error('Path of summary base directory is not accessible.')
raise FileSystemPermissionError('Path of summary base directory is not accessible.') raise FileSystemPermissionError('Path of summary base directory is not accessible.')
# sort in ascending order according to modification time.
entries = [entry for entry in entries if not entry.is_symlink()]
entries = sorted(entries, key=lambda x: x.stat().st_mtime)
for entry in entries: for entry in entries:
if len(summary_dict) == self.MAX_SUMMARY_DIR_COUNT: if len(summary_dict) == self.MAX_SUMMARY_DIR_COUNT:
break break
@@ -127,7 +130,6 @@ class SummaryWatcher:
entry_name (str): Name of entry. entry_name (str): Name of entry.
counter (Counter): An instance of CountLimiter. counter (Counter): An instance of CountLimiter.
list_explain (bool): Indicates whether to list only the mindexplain folder. list_explain (bool): Indicates whether to list only the mindexplain folder.
""" """
try: try:
subdir_entries = os.scandir(entry_path) subdir_entries = os.scandir(entry_path)
@@ -135,6 +137,9 @@ class SummaryWatcher:
logger.warning('Path of %s under summary base directory is not accessible.', entry_name) logger.warning('Path of %s under summary base directory is not accessible.', entry_name)
return return
# sort in ascending order according to modification time.
subdir_entries = [subdir_entry for subdir_entry in subdir_entries if not subdir_entry.is_symlink()]
subdir_entries = sorted(subdir_entries, key=lambda x: x.stat().st_mtime)
for subdir_entry in subdir_entries: for subdir_entry in subdir_entries:
if len(summary_dict) == self.MAX_SUMMARY_DIR_COUNT: if len(summary_dict) == self.MAX_SUMMARY_DIR_COUNT:
break break
@@ -200,7 +205,6 @@ class SummaryWatcher:
ctime = datetime.datetime.fromtimestamp(stat.st_ctime).astimezone() ctime = datetime.datetime.fromtimestamp(stat.st_ctime).astimezone()
mtime = datetime.datetime.fromtimestamp(stat.st_mtime).astimezone() mtime = datetime.datetime.fromtimestamp(stat.st_mtime).astimezone()
if entry.is_file(): if entry.is_file():
summary_pattern = re.search(self.SUMMARY_FILENAME_REGEX, entry.name) summary_pattern = re.search(self.SUMMARY_FILENAME_REGEX, entry.name)
pb_pattern = re.search(self.PB_FILENAME_REGEX, entry.name) pb_pattern = re.search(self.PB_FILENAME_REGEX, entry.name)


+ 68
- 50
mindinsight/profiler/analyser/minddata_cpu_utilization_analyser.py View File

@@ -31,6 +31,16 @@ class MinddataCpuUtilizationAnalyser(BaseAnalyser):
def __init__(self, profiling_dir, device_id): def __init__(self, profiling_dir, device_id):
super().__init__(profiling_dir, device_id) super().__init__(profiling_dir, device_id)
self._steps_info = self._get_minddata_cpu_utilization_steps_info() self._steps_info = self._get_minddata_cpu_utilization_steps_info()
self._cpu_utilization_info = dict()

def get_idle_utilization_avg(self):
    """
    Get the average idle utilization of the whole machine.

    Applies the step filter with an empty condition, recomputes the
    "device_info" averages, and then reads the "avg_value" stored for
    "idle_utilization" in the cached cpu utilization info.

    Returns:
        The "avg_value" entry of device_info/idle_utilization.
    """
    # An empty condition makes _filter fall back to its own defaults.
    self._filter({})
    self._get_cpu_utilization_average_value("device_info")

    device_info = self._cpu_utilization_info.get("device_info")
    idle_utilization = device_info.get("idle_utilization")
    return idle_utilization.get("avg_value")


def query(self, condition=None): def query(self, condition=None):
""" """
@@ -48,50 +58,19 @@ class MinddataCpuUtilizationAnalyser(BaseAnalyser):
filter_condition = condition.get('filter_condition', {}) filter_condition = condition.get('filter_condition', {})
log.info("Receive query request. %s", filter_condition) log.info("Receive query request. %s", filter_condition)
self._filter(filter_condition) self._filter(filter_condition)
result = dict()
result["sampling_interval"] = self._data.get("sampling_interval")
result["step_info"] = self._steps_info
result["step_total_num"] = self._step_total_num
result["cpu_processor_num"] = self._data.get("cpu_processor_num")
self._cpu_utilization_info["sampling_interval"] = self._data.get("sampling_interval")
self._cpu_utilization_info["step_info"] = self._steps_info
self._cpu_utilization_info["step_total_num"] = self._step_total_num
self._cpu_utilization_info["cpu_processor_num"] = self._data.get("cpu_processor_num")
# device average CPU utilization # device average CPU utilization
result["device_info"] = dict()
for key in self._data.get("device_info").keys():
arr = self._data.get("device_info")[key]
avg_value = round(sum(arr) / len(arr)) if arr else 0
result["device_info"][key] = {"metrics": arr, "avg_value": avg_value}

device_key_value = "device_info"
self._get_cpu_utilization_average_value(device_key_value)
# process average CPU utilization # process average CPU utilization
result["process_info"] = dict()
for key in self._data.get("process_info").keys():
arr = self._data.get("process_info")[key]
avg_value = round(sum(arr) / len(arr)) if arr else 0
result["process_info"][key] = {"metrics": arr, "avg_value": avg_value}

process_key_value = "process_info"
self._get_cpu_utilization_average_value(process_key_value)
# op average CPU utilization # op average CPU utilization
minddata_pipeline_op_info = self._get_minddata_pipeline_info()
result["op_info"] = dict()
result["op_info"]["op_list"] = []
result["op_info"]["total_op_avg_value"] = dict()
result["op_info"]["total_op_avg_value"]["user_utilization"] = 0
result["op_info"]["total_op_avg_value"]["sys_utilization"] = 0

for item in self._data.get("op_info"):
# Filtering out non minddata pipeline operator
if str(item.get("op_id")) == "-1":
continue
op_info_dict = dict()
op_info_dict["metrics"] = dict()
for key in item.get("metrics").keys():
arr = item.get("metrics")[key]
avg_value = round(sum(arr) / len(arr)) if arr else 0
op_info_dict["metrics"][key] = {"metrics": arr, "avg_value": avg_value}
result["op_info"]["total_op_avg_value"][key] += avg_value
op_info_dict["op_id"] = item.get("op_id")
op_info = [i for i in minddata_pipeline_op_info if i.get("op_id") == item.get("op_id")]
# op_info is like [{"num_workers":int,"op_id":int}]
op_info_dict["num_workers"] = op_info[0].get("num_workers")
result["op_info"]["op_list"].append(op_info_dict)
return result
self._get_cpu_utilization_op_average_value()
return self._cpu_utilization_info


def _load(self): def _load(self):
"""Load cpu_utilization info.""" """Load cpu_utilization info."""
@@ -101,7 +80,7 @@ class MinddataCpuUtilizationAnalyser(BaseAnalyser):
file_path, raise_key="Invalid cpu_utilization_info file path.") file_path, raise_key="Invalid cpu_utilization_info file path.")
if not os.path.exists(file_path): if not os.path.exists(file_path):
log.error('Did not find the cpu utilization file: %s', file_path) log.error('Did not find the cpu utilization file: %s', file_path)
raise ProfilerFileNotFoundException(msg='Did not find the cpu utilization file:{}'.format(file_path))
raise ProfilerFileNotFoundException(msg='Did not find the cpu utilization file.')


with open(file_path, 'r', encoding='utf-8') as file: with open(file_path, 'r', encoding='utf-8') as file:
try: try:
@@ -124,8 +103,6 @@ class MinddataCpuUtilizationAnalyser(BaseAnalyser):
""" """
start_step = filter_condition.get("start_step", 1) start_step = filter_condition.get("start_step", 1)
end_step = filter_condition.get("end_step", self._step_total_num) end_step = filter_condition.get("end_step", self._step_total_num)
if start_step == 1 and end_step == self._step_total_num:
return
while not self._steps_info.count(str(start_step)): while not self._steps_info.count(str(start_step)):
start_step += 1 start_step += 1
left_index = self._steps_info.index(str(start_step)) left_index = self._steps_info.index(str(start_step))
@@ -149,17 +126,22 @@ class MinddataCpuUtilizationAnalyser(BaseAnalyser):
item["metrics"][key] = item.get("metrics").get(key)[left_index:right_index + 1] item["metrics"][key] = item.get("metrics").get(key)[left_index:right_index + 1]


def _get_minddata_cpu_utilization_steps_info(self): def _get_minddata_cpu_utilization_steps_info(self):
"""Establish a connection between cpu utilization sampling points and host queue capacity"""
"""Establish a connection between cpu utilization sampling points and host queue capacity."""
steps_info = [] steps_info = []
left_index = 0 left_index = 0
right_index = 0 right_index = 0
time_stamp = self._data.get("time_stamp") time_stamp = self._data.get("time_stamp")
queue_step_time_info = self._get_minddata_queue_step_time_info() queue_step_time_info = self._get_minddata_queue_step_time_info()
self._step_total_num = len(queue_step_time_info) self._step_total_num = len(queue_step_time_info)
step0 = 0
for item in time_stamp: for item in time_stamp:
# queue_step_time_info[][0]:step_num
# queue_step_time_info[][1]:sample time
# points less than step1 are classified as step0
if float(item) < float(queue_step_time_info[0][1]):
steps_info.append(step0)
continue
while right_index < len(queue_step_time_info): while right_index < len(queue_step_time_info):
# queue_step_time_info[][0]:step_num
# queue_step_time_info[][1]:sample time
if float(item) <= float(queue_step_time_info[right_index][1]): if float(item) <= float(queue_step_time_info[right_index][1]):
if float(item) < float(queue_step_time_info[right_index][1]): if float(item) < float(queue_step_time_info[right_index][1]):
steps_info.append(queue_step_time_info[left_index][0]) steps_info.append(queue_step_time_info[left_index][0])
@@ -183,13 +165,16 @@ class MinddataCpuUtilizationAnalyser(BaseAnalyser):
file_path, raise_key="Invalid device_queue file path") file_path, raise_key="Invalid device_queue file path")
if not os.path.exists(file_path): if not os.path.exists(file_path):
log.error('Did not find the device queue file: %s', file_path) log.error('Did not find the device queue file: %s', file_path)
raise ProfilerFileNotFoundException(msg='Did not find the device queue file:{}'.format(file_path))
raise ProfilerFileNotFoundException(msg='Did not find the device queue file.')


with open(file_path) as data_file: with open(file_path) as data_file:
for line in data_file.readlines(): for line in data_file.readlines():
op_info = line.split() op_info = line.split()
# op_info[0]=="1":queue info, op_info[1]:Connector capacity,
# op_info[2]:step_num, op_info[3]:Connector size, op_info[4]:sampling time
# op_info is a list like:['1','64','8','2','85406783']
# The value of the first element in op_info is '0' or '1'.
# '0' means that the time information is recorded.
# '1' means that the queue information is recorded.
# '1': queue info, '64': queue capacity, '8': step_num, '2': queue size, '85406783': sampling time.
if op_info and op_info[0] == "1": if op_info and op_info[0] == "1":
minddata_queue_step_time_info.append([op_info[2], op_info[4]]) minddata_queue_step_time_info.append([op_info[2], op_info[4]])
return minddata_queue_step_time_info return minddata_queue_step_time_info
@@ -218,3 +203,36 @@ class MinddataCpuUtilizationAnalyser(BaseAnalyser):
op_info_dict["num_workers"] = item.get("num_workers") op_info_dict["num_workers"] = item.get("num_workers")
minddata_pipeline_op_info.append(op_info_dict) minddata_pipeline_op_info.append(op_info_dict)
return minddata_pipeline_op_info return minddata_pipeline_op_info

def _get_cpu_utilization_average_value(self, key_value):
"""Get cpu_utilization average value for host or process."""
self._cpu_utilization_info[key_value] = dict()
for key in self._data.get(key_value).keys():
arr = self._data.get(key_value)[key]
avg_value = round(sum(arr) / len(arr)) if arr else 0
self._cpu_utilization_info[key_value][key] = {"metrics": arr, "avg_value": avg_value}

def _get_cpu_utilization_op_average_value(self):
"""Get cpu_utilization average value for op."""
minddata_pipeline_op_info = self._get_minddata_pipeline_info()
self._cpu_utilization_info["op_info"] = {
"op_list": [],
"total_op_avg_value": {"user_utilization": 0, "sys_utilization": 0}
}

for item in self._data.get("op_info"):
# Filtering out non minddata pipeline operator
if str(item.get("op_id")) == "-1":
continue
op_info_dict = dict()
op_info_dict["metrics"] = dict()
for key in item.get("metrics").keys():
arr = item.get("metrics")[key]
avg_value = round(sum(arr) / len(arr)) if arr else 0
op_info_dict["metrics"][key] = {"metrics": arr, "avg_value": avg_value}
self._cpu_utilization_info["op_info"]["total_op_avg_value"][key] += avg_value
op_info_dict["op_id"] = item.get("op_id")
op_info = [i for i in minddata_pipeline_op_info if i.get("op_id") == item.get("op_id")]
# op_info is like [{"num_workers":int,"op_id":int}]
op_info_dict["num_workers"] = op_info[0].get("num_workers")
self._cpu_utilization_info["op_info"]["op_list"].append(op_info_dict)

+ 13
- 9
mindinsight/profiler/proposer/allproposers/minddata_proposer.py View File

@@ -1,4 +1,4 @@
# Copyright 2020 Huawei Technologies Co., Ltd
# Copyright 2020-2021 Huawei Technologies Co., Ltd
# #
# Licensed under the Apache License, Version 2.0 (the "License"); # Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License. # you may not use this file except in compliance with the License.
@@ -20,6 +20,8 @@ from collections import OrderedDict
from mindinsight.profiler.analyser.analyser_factory import AnalyserFactory from mindinsight.profiler.analyser.analyser_factory import AnalyserFactory
from mindinsight.profiler.analyser.minddata_analyser import MinddataAnalyser from mindinsight.profiler.analyser.minddata_analyser import MinddataAnalyser
from mindinsight.profiler.proposer.allproposers.base_proposer import Proposer from mindinsight.profiler.proposer.allproposers.base_proposer import Proposer
from mindinsight.profiler.common.log import logger as log
from mindinsight.profiler.common.exceptions.exceptions import ProfilerRawFileException, ProfilerFileNotFoundException
class MinddataProposer(Proposer): class MinddataProposer(Proposer):
@@ -96,11 +98,13 @@ class MinddataProposer(Proposer):
minddata_cpu_utilization = OrderedDict() minddata_cpu_utilization = OrderedDict()
minddata_cpu_utilization_analyser = AnalyserFactory.instance().get_analyser( minddata_cpu_utilization_analyser = AnalyserFactory.instance().get_analyser(
'minddata_cpu_utilization', self.profiling_path, self.device_id) 'minddata_cpu_utilization', self.profiling_path, self.device_id)
result = minddata_cpu_utilization_analyser.query()
idle_utilization_avg = result.get("device_info").get("idle_utilization").get("avg_value")
# The maximum value of this cpu_activate_utilization_avg is 100%.
cpu_activate_utilization_avg = 100 - idle_utilization_avg
cpu_activate_utilization_threshold = 80
if cpu_activate_utilization_avg > cpu_activate_utilization_threshold:
minddata_cpu_utilization["minddata_cpu_utilization"] = [cpu_activate_utilization_avg]
self.__proposal_dict.update(minddata_cpu_utilization)
try:
idle_utilization_avg = minddata_cpu_utilization_analyser.get_idle_utilization_avg()
# The maximum value of this cpu_activate_utilization_avg is 100%.
cpu_activate_utilization_avg = 100 - idle_utilization_avg
cpu_activate_utilization_threshold = 80
if cpu_activate_utilization_avg > cpu_activate_utilization_threshold:
minddata_cpu_utilization["minddata_cpu_utilization"] = [cpu_activate_utilization_avg]
self.__proposal_dict.update(minddata_cpu_utilization)
except (ProfilerRawFileException, ProfilerFileNotFoundException) as err:
log.exception(err)

Loading…
Cancel
Save