- # Copyright 2020 Huawei Technologies Co., Ltd
- #
- # Licensed under the Apache License, Version 2.0 (the "License");
- # you may not use this file except in compliance with the License.
- # You may obtain a copy of the License at
- #
- # http://www.apache.org/licenses/LICENSE-2.0
- #
- # Unless required by applicable law or agreed to in writing, software
- # distributed under the License is distributed on an "AS IS" BASIS,
- # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
- # See the License for the specific language governing permissions and
- # limitations under the License.
- # ============================================================================
- """Data process analyser."""
- import os
-
- from mindinsight.profiler.analyser.base_analyser import BaseAnalyser
-
-
- class MinddataAnalyser(BaseAnalyser):
- """The Minddata profiling analyser."""
-
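- # Thresholds used by analyse_queue_summary; both are fractions of the total
- # number of recorded steps (interpretation inferred from their usage below).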
- DEVICE_QUEUE_EMPTY_WARNING_THRESHOLD = 0.7
- DEVICE_QUEUE_NOT_EMPTY_THRESHOLD = 0.95
-
- def analyse_get_next_info(self, info_type="all"):
- """
- Analyse the get_next operation info.
-
- Args:
- info_type (str): The info type to return. The default "all" returns both queue and time info;
- other options are "queue" and "time".
-
- Returns:
- dict, the queue size info of the get_next operation.
- dict, the time cost info of the get_next operation.
- """
- # init queue info result
- queue_info = dict()
- queue_size_list = []
- empty_step_count = 0
-
- # init time info result
- time_info = dict()
- time_list = []
- total_cost = 0
-
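- # Profiling file that records the GetNext dequeue info for this device.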
- file_name = "minddata_aicpu_" + self._device_id + ".txt"
- file_path = MinddataAnalyser.find_target_file(self._profiling_dir, file_name)
-
- if file_path:
- with open(file_path) as data_file:
- for line in data_file.readlines():
- node_info = line.split()
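- # Each record is expected to look like: op_name start_time end_time queue_size.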
- if node_info and node_info[0] == "GetNext_dequeue_wait":
- # analyse target info type
- if len(node_info) > 3 and info_type in ["all", "queue"]:
- queue_size_list.append(int(node_info[3]))
- if node_info[3] == '0':
- empty_step_count += 1
- if len(node_info) > 2 and info_type in ["all", "time"]:
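- # One step cost is end - start, scaled down by 1e3 (assumed to convert the raw unit to ms).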
- one_step_cost_time = (float(node_info[2]) - float(node_info[1]))/1e3
- time_list.append(one_step_cost_time)
- total_cost += one_step_cost_time
- if info_type in ["all", "queue"]:
- queue_info["size"] = len(queue_size_list)
- queue_info["info"] = {"queue": queue_size_list}
- queue_info["summary"] = {
- "queue_summary": {
- "empty_queue": empty_step_count
- }
- }
- if info_type in ["all", "time"]:
- time_info["size"] = len(time_list)
- time_info["info"] = {"get_next": time_list}
- if time_info["size"]:
- time_info["summary"] = {
- "time_summary": {
- "avg_cost": "0" if not time_list else str(total_cost / len(time_list))
- }
- }
-
- return queue_info, time_info
-
- def analyse_device_queue_info(self, info_type="all"):
- """
- Analyse the device_queue operation info.
-
- Args:
- info_type (str): The info type to return. The default "all" returns both queue and time info;
- other options are "queue" and "time".
-
- Returns:
- dict, queue size info.
- dict, time cost info.
- """
- # init queue info result
- queue_info = dict()
- get_time_list, push_time_list, total_time_list = [], [], []
- total_cost, total_push, total_get = 0, 0, 0
-
- # init time info result
- time_info = dict()
- queue_size_list = []
- empty_step, full_step = 0, 0
-
- file_path = self.get_device_queue_file_path()
-
- if file_path:
- with open(file_path) as data_file:
- for line in data_file.readlines():
- op_info = line.split()
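- # Each record is expected to look like: record_type sub_type step_num value,
- # where record_type "0" is a time record and "1" is a queue-size record.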
- # time info
- if op_info and op_info[0] == "0" and info_type in ["all", "time"]:
- # sub_type: 0 get_time, 1 push time, 2 total time
- # op_info: 2: step num 3: cost time
- if op_info[1] == "0":
- get_time_list.append([int(op_info[2]), float(op_info[3])])
- total_get += float(op_info[3])
- elif op_info[1] == "1":
- push_time_list.append([int(op_info[2]), float(op_info[3])])
- total_push += float(op_info[3])
- elif op_info[1] == "2":
- total_time_list.append([int(op_info[2]), float(op_info[3])])
- total_cost += float(op_info[3])
- elif op_info and op_info[0] == "1" and info_type in ["all", "queue"]:
- queue_size_list.append([int(op_info[2]), int(op_info[3])])
- if op_info[1] == op_info[3]:
- full_step += 1
- if op_info[3] == "0":
- empty_step += 1
- if info_type in ["all", "time"]:
- total_time_list = MinddataAnalyser.sort_step(total_time_list)
- push_time_list = MinddataAnalyser.sort_step(push_time_list)
- get_time_list = MinddataAnalyser.sort_step(get_time_list)
-
- time_info["size"] = len(total_time_list)
- time_info["info"] = {"total_cost": total_time_list,
- "push_cost": push_time_list,
- "get_cost": get_time_list}
- if time_info["size"]:
- time_info["summary"] = {"time_summary": {"avg_cost": total_cost/time_info["size"]}}
- time_info["summary"]["time_summary"]["get_cost"] = total_get/time_info["size"]
- time_info["summary"]["time_summary"]["push_cost"] = total_push/time_info["size"]
-
- if info_type in ["all", "queue"]:
- queue_size_list = MinddataAnalyser.sort_step(queue_size_list)
-
- queue_info["size"] = len(queue_size_list)
- queue_info["info"] = {"queue": queue_size_list}
- queue_info["summary"] = {"queue_summary": {"empty_queue": empty_step}}
- queue_info["summary"]["queue_summary"]["full_queue"] = full_step
-
- return queue_info, time_info
-
- def get_device_queue_file_path(self):
- """
- Get device queue file path.
-
- Returns:
- str, the file path.
- """
- device_queue_file_name = "device_queue_profiling_" + self._device_id + ".txt"
- device_queue_file_path = MinddataAnalyser.find_target_file(self._profiling_dir, device_queue_file_name)
- feed_file_name = "dataset_iterator_profiling_" + self._device_id + ".txt"
- feed_file_path = MinddataAnalyser.find_target_file(self._profiling_dir, feed_file_name)
- file_path = ""
- if device_queue_file_path:
- file_path = device_queue_file_path
- elif feed_file_path:
- file_path = feed_file_path
-
- return file_path
-
- @staticmethod
- def analyse_queue_summary(get_next_queue_info, device_queue_info):
- """
- Analyse the queue summary info.
-
- Args:
- get_next_queue_info (dict): the get_next queue info returned by the analyser.
- device_queue_info (dict): the device queue info returned by the analyser.
-
- Returns:
- dict, the summary of queue.
- """
- result = {}
- if get_next_queue_info and device_queue_info:
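- # Both get_next and device queue info are available: report a status for each
- # pipeline stage (data_process, device_queue_op, tdt, get_next).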
- result = {"data_process": {"status": "normal"},
- "device_queue_op": {"status": "normal"},
- "tdt": {"status": "normal"},
- "get_next": {"status": "normal"}}
-
- get_next_queue_empty_count = get_next_queue_info.get(
- "summary", {}).get("queue_summary", {}).get("empty_queue", 0)
- result["get_next_queue_info"] = {
- "summary": {
- "empty_batch_count": get_next_queue_empty_count,
- "total_batch": get_next_queue_info.get("size")
- }
- }
-
- device_queue_empty_count = device_queue_info.get(
- "summary", {}).get("queue_summary", {}).get("empty_queue", 0)
- device_queue_full_count = device_queue_info.get(
- "summary", {}).get("queue_summary", {}).get("full_queue", 0)
-
- result["device_queue_info"] = {"summary": {
- "empty_batch_count": device_queue_empty_count,
- "full_batch_count": device_queue_full_count,
- "total_batch": device_queue_info.get("size")}}
-
- if get_next_queue_empty_count:
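- # get_next waited on an empty queue: if the device queue was also empty for most
- # steps, host data processing is the likely bottleneck; otherwise flag the
- # tdt/device_queue_op path.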
- if device_queue_empty_count > device_queue_info.get("size", 0)*\
- MinddataAnalyser.DEVICE_QUEUE_EMPTY_WARNING_THRESHOLD:
- result["data_process"]["status"] = "warning"
- elif device_queue_empty_count < device_queue_info.get("size", 0)*\
- MinddataAnalyser.DEVICE_QUEUE_NOT_EMPTY_THRESHOLD:
- result["tdt"]["status"] = "warning"
- result["device_queue_op"]["status"] = "warning"
-
- elif device_queue_info and not get_next_queue_info:
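- # Only device queue info is available: report data_process and fpbp (forward and
- # backward propagation) status.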
- result = {"data_process": {"status": "normal"},
- "fpbp": {"status": "normal"}}
-
- device_queue_empty_count = device_queue_info.get(
- "summary", {}).get("queue_summary", {}).get("empty_queue", 0)
- device_queue_full_count = device_queue_info.get(
- "summary", {}).get("queue_summary", {}).get("full_queue", 0)
-
- result["device_queue_info"] = {
- "summary": {
- "empty_batch_count": device_queue_empty_count,
- "full_batch_count": device_queue_full_count,
- "total_batch": device_queue_info.get("size")
- }
- }
-
- if device_queue_empty_count > device_queue_info.get("size", 0)*\
- MinddataAnalyser.DEVICE_QUEUE_EMPTY_WARNING_THRESHOLD:
- result["data_process"]["status"] = "warning"
-
- return result
-
- @staticmethod
- def sort_step(step_info_list):
- """
- Sort the list by the first item (step number) and return a list of the second items.
-
- Args:
- step_info_list (list): The step info, where each item is [step_num, info].
-
- Returns:
- list, the info list sorted by step.
- """
- step_info_list.sort(key=lambda x: x[0])
- result = []
- for item in step_info_list:
- result.append(item[1])
- return result
-
- @staticmethod
- def find_target_file(file_dir, file_name):
- """
- Find the target file in the directory and return its absolute path, or "" if it is not found.
-
- Args:
- file_dir (str): The target file dir.
- file_name (str): The target file name.
-
- Returns:
- str, the abs file path.
- """
- target_file_path = ""
- for root_path, _, file_names in os.walk(file_dir):
- for item in file_names:
- if item == file_name:
- target_file_path = os.path.join(root_path, file_name)
-
- return target_file_path
-
- def _filter(self, filter_condition):
- """
- Filter the profiling data according to the filter condition.
-
- Args:
- filter_condition (dict): The filter condition.
- """
-
- def _load(self):
- """Load data according to the parsed profiling files."""