Browse Source

add st of profiler timeline

tags/v1.1.0
gzhcv 5 years ago
parent
commit
a525f61f68
6 changed files with 123 additions and 280 deletions
  1. +1
    -280
      mindinsight/profiler/analyser/timeline_analyser.py
  2. +118
    -0
      tests/st/func/profiler/test_timeline_analyser.py
  3. +1
    -0
      tests/utils/resource/profiler/ascend_timeline_display_0.json
  4. +1
    -0
      tests/utils/resource/profiler/ascend_timeline_summary_0.json
  5. +1
    -0
      tests/utils/resource/profiler/gpu_timeline_display_0.json
  6. +1
    -0
      tests/utils/resource/profiler/gpu_timeline_summary_0.json

+ 1
- 280
mindinsight/profiler/analyser/timeline_analyser.py View File

@@ -17,75 +17,20 @@ import json
import os

from mindinsight.profiler.analyser.base_analyser import BaseAnalyser
from mindinsight.profiler.common.exceptions.exceptions import ProfilerFileNotFoundException, \
ProfilerIOException
from mindinsight.profiler.common.exceptions.exceptions import ProfilerIOException
from mindinsight.profiler.common.log import logger
from mindinsight.profiler.common.validator.validate_path import validate_and_normalize_path
from mindinsight.utils.exceptions import ParamValueError

SIZE_LIMIT = 20 * 1024 * 1024 # 20MB


class TimelineContainer:
    """
    A container of operator computation metadata.

    Args:
        split_list (list): The split list of metadata in op_compute output file.
            Layout is [op_name, stream_id, start_time, duration] with an
            optional fifth pid element.
    """
    def __init__(self, split_list):
        self._op_name = split_list[0]
        self._stream_id = int(split_list[1])
        self._start_time = float(split_list[2])
        self._duration = float(split_list[3])
        # pid is only present for records carrying exactly five fields.
        self._pid = int(split_list[4]) if len(split_list) == 5 else None

    @property
    def op_name(self):
        """Get the name of the operator."""
        return self._op_name

    @property
    def stream_id(self):
        """Get the stream id of the operator."""
        return self._stream_id

    @property
    def start_time(self):
        """Get the execution start time of the operator."""
        return self._start_time

    @property
    def duration(self):
        """Get the duration of the operator execution."""
        return self._duration

    @property
    def pid(self):
        """Get the pid of the operator execution (None when not recorded)."""
        return self._pid


class TimelineAnalyser(BaseAnalyser):
"""
Analyse timeline data from file.
"""
__col_names__ = ['op_name', 'stream_id', 'start_time', 'duration']
_output_timeline_data_file_path = 'output_timeline_data_{}.txt'
_min_cycle_counter_file_path = 'min_cycle_counter_{}.txt'
_ascend_display_filename = 'ascend_timeline_display_{}.json'
_gpu_display_filename = 'gpu_timeline_display_{}.json'
_ascend_timeline_summary_filename = 'ascend_timeline_summary_{}.json'
_gpu_timeline_summary_filename = 'gpu_timeline_summary_{}.json'
_timeline_meta = []
_timeline_summary = {
'total_time': 0,
'num_of_streams': 0,
'num_of_ops': 0,
'op_exe_times': 0
}

def _load(self):
"""Load data according to the parsed profiling files."""
@@ -161,227 +106,3 @@ class TimelineAnalyser(BaseAnalyser):
logger.info('No timeline summary file. Please check the output path.')

return timeline_summary

def write_timeline(self):
    """Write the collected timeline metadata to the display json file."""
    # Delegate to the size-limited writer so the output stays under SIZE_LIMIT.
    logger.info('Writing timeline file...')
    self.write_timeline_to_json_by_limitation()
    logger.info('Finished file writing!')

def write_timeline_to_json_by_limitation(self):
    """
    Write timeline metadata to the display json file, truncating at SIZE_LIMIT.

    Items from self._timeline_meta are dumped one at a time; once the file
    grows past SIZE_LIMIT (20MB) the remaining items are dropped so the UI
    payload stays bounded. The output is always a closed json array.

    Raises:
        ProfilerIOException: If the display file cannot be written.
    """
    display_filename = self._display_filename.format(self._device_id)
    display_file_path = os.path.join(
        self._profiling_dir,
        display_filename
    )
    display_file_path = validate_and_normalize_path(
        display_file_path, raise_key='Invalid timeline display json path.'
    )

    length = len(self._timeline_meta)
    try:
        with open(display_file_path, 'w') as json_file:
            json_file.write('[')
            for index, item in enumerate(self._timeline_meta):
                json.dump(item, json_file)
                # Flush before measuring: os.path.getsize reports the on-disk
                # size, so buffered-but-unwritten data would make the limit
                # check lag behind what has actually been produced.
                json_file.flush()
                file_size = os.path.getsize(display_file_path)
                if file_size > SIZE_LIMIT:
                    break
                if index == length - 1:
                    break
                json_file.write(',')
            json_file.write(']')
    except (IOError, OSError) as err:
        logger.error('Error occurred when write timeline display file: %s', err)
        raise ProfilerIOException

def write_timeline_summary(self):
    """Write timeline summary to json."""
    summary_filename = self._timeline_summary_filename.format(self._device_id)
    summary_path = validate_and_normalize_path(
        os.path.join(self._profiling_dir, summary_filename),
        raise_key='Invalid timeline summary path.'
    )

    try:
        # Summary is a small fixed-key dict, so a single dump is sufficient.
        with open(summary_path, 'w') as json_file:
            json.dump(self._timeline_summary, json_file)
    except (IOError, OSError) as err:
        logger.error('Error occurred when write timeline summary file: %s', err)
        raise ProfilerIOException

def _load_timeline_data(self):
    """Load timeline data from file."""
    file_path = validate_and_normalize_path(
        os.path.join(
            self._profiling_dir,
            self._output_timeline_data_file_path.format(self._device_id)
        ),
        raise_key='Invalid timeline txt file path.'
    )
    if not os.path.exists(file_path):
        logger.error("Failed to find parsed timeline file.")
        # NOTE(review): this commit's diff appears to remove
        # ProfilerFileNotFoundException from the module imports while this
        # raise still references it — confirm the name is still in scope.
        raise ProfilerFileNotFoundException('parsed timeline file')

    try:
        with open(file_path, 'r') as f_obj:
            # Skip the header line (starts with 'op_name'); every remaining
            # line is one comma-separated timeline record.
            timeline_list = [
                line.strip('\n').split(',')
                for line in f_obj
                if not line.startswith('op_name')
            ]
    except (IOError, OSError) as err:
        logger.error('Error occurred when read timeline intermediate file: %s', err)
        raise ProfilerIOException

    return timeline_list

def _parse_timeline_data(self, timeline, min_cycle_counter):
    """Convert one timeline record to chrome-trace format and collect it."""
    # factor to convert the time unit from 1ms to 1us for timeline display
    factor = 1000
    op_meta = TimelineContainer(timeline)
    duration = op_meta.duration * factor
    timeline_dict = {
        'name': op_meta.op_name,
        'ph': 'X',
        'tid': op_meta.stream_id,
        'ts': (op_meta.start_time - min_cycle_counter) * factor,
        'dur': duration,
    }
    if op_meta.pid is None:
        timeline_dict['pid'] = int(self._device_id)
        # Update total time of operator execution.
        self._timeline_summary['total_time'] += duration
    else:  # AllReduce and AI CPU pid
        timeline_dict['pid'] = op_meta.pid
    self._timeline_meta.append(timeline_dict)

@staticmethod
def _update_num_of_streams(timeline, stream_count_dict):
"""Update number of streams."""
stream_id = timeline[1]
if stream_id not in stream_count_dict.keys():
stream_count_dict[stream_id] = 1
else:
stream_count_dict[stream_id] += 1

def get_min_cycle_counter(self):
    """
    Get minimum cycle counter.

    Returns:
        float, the minimum value of the cycle counter; 0 when the file is
        absent or records 'inf'.

    Raises:
        ProfilerIOException: If the recorded counter file cannot be read.
    """
    file_path = os.path.join(
        self._profiling_dir,
        self._min_cycle_counter_file_path.format(self._device_id)
    )

    file_path = validate_and_normalize_path(
        file_path, raise_key='Invalid min cycle counter file path.'
    )

    if os.path.exists(file_path):
        try:
            with open(file_path, 'r') as f_obj:
                # Strip surrounding whitespace first: with a trailing newline
                # the 'inf' comparison fails and float('inf\n') leaks through
                # as inf instead of the intended 0.
                min_cycle_counter = f_obj.read().strip()
                min_cycle_counter = float(min_cycle_counter) \
                    if not min_cycle_counter == 'inf' else 0
        except (IOError, OSError) as err:
            logger.error('Error occurred when read minimum cycle counter: %s', err)
            raise ProfilerIOException
    else:
        min_cycle_counter = 0
        logger.info("No min cycle counter recorded.")

    return min_cycle_counter

def init_timeline(self, all_reduce_info, framework_info, aicpu_info, min_cycle_counter):
    """
    Init timeline metadata, adding all collected info.

    Args:
        all_reduce_info (list[list]): The metadata of AllReduce operator.
        framework_info (dict): The framework metadata.
        aicpu_info (dict): The metadata of AI CPU operator.
        min_cycle_counter (float): The minimum cycle counter of the timeline.
    """
    if min_cycle_counter == float('inf'):
        min_cycle_counter = 0

    logger.info('Initiating timeline...')
    timeline_list = self._load_timeline_data()
    self._timeline_summary['op_exe_times'] = len(timeline_list)

    # Add AllReduce info to timeline temp list and sort by start time.
    if all_reduce_info:
        logger.debug('AllReduce info found. Start adding info into timeline...')
        timeline_list.extend(all_reduce_info)
        timeline_list.sort(key=lambda x: float(x[2]))

    # Add AI CPU data into timeline temp list and sort by start time.
    aicpu_data = aicpu_info.get('info')
    if aicpu_data:
        timeline_list.extend(aicpu_data)
        timeline_list.sort(key=lambda x: float(x[2]))
        self._timeline_summary['op_exe_times'] += aicpu_info.get('op_exe_times', 0)
        self._timeline_summary['num_of_streams'] += aicpu_info.get('num_of_streams', 0)
        self._timeline_summary['num_of_ops'] += aicpu_info.get('num_of_ops', 0)
        self._timeline_summary['total_time'] += aicpu_info.get('total_time', 0)

    # Init a dict for counting the num of streams.
    stream_count_dict = {}
    for timeline in timeline_list:
        self._parse_timeline_data(timeline, min_cycle_counter)
        # Updating the collection of streams: records without an explicit
        # pid field (4 columns) belong to device streams.
        if len(timeline) == 4:
            self._update_num_of_streams(timeline, stream_count_dict)

    # Get framework metadata. Guard against a missing 'object' key, which
    # previously raised TypeError on len(None).
    framework_obj_list = framework_info.get('object')
    if framework_obj_list:
        # The length of list is the number of operators.
        self._timeline_summary['num_of_ops'] += len(framework_obj_list)
        self._add_framework_info(framework_obj_list)
    logger.info('Finished adding info into timeline...')

    # Update timeline summary info
    self._timeline_summary['num_of_streams'] += len(stream_count_dict.keys())

def _add_framework_info(self, framework_obj_list):
    """
    Add framework info into timeline metadata.

    Args:
        framework_obj_list (list): The framework metadata.
    """
    logger.debug('Start adding framework info into timeline...')
    # Build a lookup from op full name to the display name and args that
    # should be written into the timeline.
    framework_info_dict = {}
    for framework_obj in framework_obj_list:
        op_name, op_type = framework_obj[0], framework_obj[1]
        op_full_name, op_info = framework_obj[4], framework_obj[5]
        args = {
            'type': op_type,
            'fullname': op_full_name
        }
        args.update(op_info)
        framework_info_dict[op_full_name] = {'name': op_name, 'args': args}

    # Insert framework info into timeline.
    for timeline_item in self._timeline_meta:
        framework_item = framework_info_dict.get(timeline_item.get('name'))
        if framework_item:
            timeline_item['name'] = framework_item.get('name')
            timeline_item['args'] = framework_item.get('args')

    logger.debug('Finished adding framework info into timeline...')

+ 118
- 0
tests/st/func/profiler/test_timeline_analyser.py View File

@@ -0,0 +1,118 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""
Function:
Test profiler to watch the performance of training.
Usage:
pytest tests/st/func/profiler
"""
import os

import pytest

from mindinsight.profiler.analyser.analyser_factory import AnalyserFactory
from . import PROFILER_DIR
from ....utils.tools import compare_result_with_file

class TestTimelineAnalyser:
    """Test timeline analyser module."""

    @classmethod
    def setup_class(cls):
        """Generate parsed files."""
        cls.profiler = PROFILER_DIR
        cls.device_id = 0
        cls.ascend_display_filename = 'ascend_timeline_display_{}.json'
        cls.gpu_display_filename = 'gpu_timeline_display_{}.json'
        cls.ascend_timeline_summary_filename = 'ascend_timeline_summary_{}.json'
        cls.gpu_timeline_summary_filename = 'gpu_timeline_summary_{}.json'

    def setup_method(self):
        """Create analyser."""
        self._analyser = AnalyserFactory.instance().get_analyser(
            'timeline', self.profiler, self.device_id)


    @pytest.mark.level0
    @pytest.mark.env_single
    @pytest.mark.platform_x86_cpu
    @pytest.mark.platform_arm_ascend_training
    @pytest.mark.platform_x86_gpu_training
    @pytest.mark.platform_x86_ascend_training
    def test_get_display_timeline(self):
        """Test the function of get timeline data for UI display."""
        gpu_file_path = os.path.join(
            self.profiler,
            self.gpu_display_filename.format(self.device_id)
        )
        ascend_file_path = os.path.join(
            self.profiler,
            self.ascend_display_filename.format(self.device_id)
        )

        result = self._analyser.get_display_timeline("gpu")
        compare_result_with_file(result, gpu_file_path)

        result = self._analyser.get_display_timeline("ascend")
        compare_result_with_file(result, ascend_file_path)

    @pytest.mark.level0
    @pytest.mark.env_single
    @pytest.mark.platform_x86_cpu
    @pytest.mark.platform_arm_ascend_training
    @pytest.mark.platform_x86_gpu_training
    @pytest.mark.platform_x86_ascend_training
    def test_get_timeline_summary(self):
        """Test the function of get timeline summary for UI display."""
        gpu_file_path = os.path.join(
            self.profiler,
            self.gpu_timeline_summary_filename.format(self.device_id)
        )
        ascend_file_path = os.path.join(
            self.profiler,
            self.ascend_timeline_summary_filename.format(self.device_id)
        )

        result = self._analyser.get_timeline_summary("gpu")
        compare_result_with_file(result, gpu_file_path)

        result = self._analyser.get_timeline_summary("ascend")
        compare_result_with_file(result, ascend_file_path)

    @pytest.mark.level0
    @pytest.mark.env_single
    @pytest.mark.platform_x86_cpu
    @pytest.mark.platform_arm_ascend_training
    @pytest.mark.platform_x86_gpu_training
    @pytest.mark.platform_x86_ascend_training
    def test_timeline_summary_file_not_exist(self):
        """Test getting timeline summary when the summary file is absent."""
        # device_id 1 has no generated files in the test resources.
        device_id = 1
        analyser = AnalyserFactory.instance().get_analyser(
            'timeline', self.profiler, device_id)
        analyser.get_timeline_summary("gpu")

    @pytest.mark.level0
    @pytest.mark.env_single
    @pytest.mark.platform_x86_cpu
    @pytest.mark.platform_arm_ascend_training
    @pytest.mark.platform_x86_gpu_training
    @pytest.mark.platform_x86_ascend_training
    def test_timeline_display_file_not_exist(self):
        """Test getting display timeline when the display file is absent."""
        # device_id 1 has no generated files in the test resources.
        device_id = 1
        analyser = AnalyserFactory.instance().get_analyser(
            'timeline', self.profiler, device_id)
        analyser.get_display_timeline("gpu")

+ 1
- 0
tests/utils/resource/profiler/ascend_timeline_display_0.json View File

@@ -0,0 +1 @@
[{"name": "InitData", "ph": "X", "tid": "0", "ts": 0.0, "dur": 159.9999964237213, "pid": 9000},{"name": "GetNext", "ph": "X", "tid": "0", "ts": 6042138.9999985695, "dur": 517.9999768733978, "pid": 9000},{"name": "AssignAdd-op142", "ph": "X", "tid": "519", "ts": 6042909.559994936, "dur": 3.1, "pid": 1, "args": {"type": "AssignAdd", "fullname": "Default/AssignAdd-op142", "input_0": {"format": "DefaultFormat", "data_type": "NUMBER_TYPE_UINT8", "shape": "1"}, "input_1": {"format": "DefaultFormat", "data_type": "NUMBER_TYPE_UINT8", "shape": "1"}, "output_0": {"format": "DefaultFormat", "data_type": "NUMBER_TYPE_UINT8", "shape": "1"}}}]

+ 1
- 0
tests/utils/resource/profiler/ascend_timeline_summary_0.json View File

@@ -0,0 +1 @@
{"total_time": 23147.013999999992, "num_of_streams": 2, "num_of_ops": 213, "op_exe_times": 112}

+ 1
- 0
tests/utils/resource/profiler/gpu_timeline_display_0.json View File

@@ -0,0 +1 @@
[{"name": "Default/InitDataSetQueue-op0", "ph": "X", "tid": "Ops", "ts": 1600695893269618.2, "dur": 303.84, "pid": 0},{"name": "void MomentumUpdateVariableKernel<float, float, float>(unsigned long, float*, float*, float const*, float const*, float const*, bool)", "ph": "X", "tid": "17", "ts": 1600695897737724.8, "dur": 5.792, "pid": 0, "args": {"type": "cuLaunchKernel", "op_full_name": "Default/optimizer-Momentum/ApplyMomentum-op55", "block_dim": "1024,1,1", "grid_dim": "8,1,1"}}]

+ 1
- 0
tests/utils/resource/profiler/gpu_timeline_summary_0.json View File

@@ -0,0 +1 @@
{"total_time": 3875.638703999999, "num_of_streams": 1, "num_of_ops": 66, "op_exe_times": 66}

Loading…
Cancel
Save