From a525f61f680ab4473603cf3af16fbb5e86983b09 Mon Sep 17 00:00:00 2001 From: gzhcv Date: Tue, 22 Sep 2020 17:01:54 +0800 Subject: [PATCH] add st of profiler timeline --- .../profiler/analyser/timeline_analyser.py | 281 +----------------- .../func/profiler/test_timeline_analyser.py | 118 ++++++++ .../profiler/ascend_timeline_display_0.json | 1 + .../profiler/ascend_timeline_summary_0.json | 1 + .../profiler/gpu_timeline_display_0.json | 1 + .../profiler/gpu_timeline_summary_0.json | 1 + 6 files changed, 123 insertions(+), 280 deletions(-) create mode 100644 tests/st/func/profiler/test_timeline_analyser.py create mode 100644 tests/utils/resource/profiler/ascend_timeline_display_0.json create mode 100644 tests/utils/resource/profiler/ascend_timeline_summary_0.json create mode 100644 tests/utils/resource/profiler/gpu_timeline_display_0.json create mode 100644 tests/utils/resource/profiler/gpu_timeline_summary_0.json diff --git a/mindinsight/profiler/analyser/timeline_analyser.py b/mindinsight/profiler/analyser/timeline_analyser.py index 5af13797..80fecedb 100644 --- a/mindinsight/profiler/analyser/timeline_analyser.py +++ b/mindinsight/profiler/analyser/timeline_analyser.py @@ -17,75 +17,20 @@ import json import os from mindinsight.profiler.analyser.base_analyser import BaseAnalyser -from mindinsight.profiler.common.exceptions.exceptions import ProfilerFileNotFoundException, \ - ProfilerIOException +from mindinsight.profiler.common.exceptions.exceptions import ProfilerIOException from mindinsight.profiler.common.log import logger from mindinsight.profiler.common.validator.validate_path import validate_and_normalize_path from mindinsight.utils.exceptions import ParamValueError -SIZE_LIMIT = 20 * 1024 * 1024 # 20MB - - -class TimelineContainer: - """ - A container of operator computation metadata. - - Args: - split_list (list): The split list of metadata in op_compute output file. 
- """ - def __init__(self, split_list): - self._op_name = split_list[0] - self._stream_id = int(split_list[1]) - self._start_time = float(split_list[2]) - self._duration = float(split_list[3]) - self._pid = None - if len(split_list) == 5: - self._pid = int(split_list[4]) - - @property - def op_name(self): - """Get the name of the operator.""" - return self._op_name - - @property - def stream_id(self): - """Get the stream id of the operator.""" - return self._stream_id - - @property - def start_time(self): - """Get the execution start time of the operator.""" - return self._start_time - - @property - def duration(self): - """Get the duration of the operator execution.""" - return self._duration - - @property - def pid(self): - """Get the pid of the operator execution.""" - return self._pid - class TimelineAnalyser(BaseAnalyser): """ Analyse timeline data from file. """ - __col_names__ = ['op_name', 'stream_id', 'start_time', 'duration'] - _output_timeline_data_file_path = 'output_timeline_data_{}.txt' - _min_cycle_counter_file_path = 'min_cycle_counter_{}.txt' _ascend_display_filename = 'ascend_timeline_display_{}.json' _gpu_display_filename = 'gpu_timeline_display_{}.json' _ascend_timeline_summary_filename = 'ascend_timeline_summary_{}.json' _gpu_timeline_summary_filename = 'gpu_timeline_summary_{}.json' - _timeline_meta = [] - _timeline_summary = { - 'total_time': 0, - 'num_of_streams': 0, - 'num_of_ops': 0, - 'op_exe_times': 0 - } def _load(self): """Load data according to the parsed profiling files.""" @@ -161,227 +106,3 @@ class TimelineAnalyser(BaseAnalyser): logger.info('No timeline summary file. Please check the output path.') return timeline_summary - - def write_timeline(self): - """Load data according to the parsed profiling files.""" - # Write timeline to file. 
- logger.info('Writing timeline file...') - self.write_timeline_to_json_by_limitation() - logger.info('Finished file writing!') - - def write_timeline_to_json_by_limitation(self): - """Write timeline to json by limitation.""" - display_filename = self._display_filename.format(self._device_id) - display_file_path = os.path.join( - self._profiling_dir, - display_filename - ) - display_file_path = validate_and_normalize_path( - display_file_path, raise_key='Invalid timeline display json path.' - ) - - length = len(self._timeline_meta) - try: - with open(display_file_path, 'w') as json_file: - json_file.write('[') - for index, item in enumerate(self._timeline_meta): - json.dump(item, json_file) - file_size = os.path.getsize(display_file_path) - if file_size > SIZE_LIMIT: - break - if index == length - 1: - break - json_file.write(',') - json_file.write(']') - except (IOError, OSError) as err: - logger.error('Error occurred when write timeline display file: %s', err) - raise ProfilerIOException - - def write_timeline_summary(self): - """Write timeline summary to json.""" - timeline_summary_file_path = os.path.join( - self._profiling_dir, - self._timeline_summary_filename.format(self._device_id) - ) - - timeline_summary_file_path = validate_and_normalize_path( - timeline_summary_file_path, raise_key='Invalid timeline summary path.' - ) - - try: - with open(timeline_summary_file_path, 'w') as json_file: - json.dump(self._timeline_summary, json_file) - except (IOError, OSError) as err: - logger.error('Error occurred when write timeline summary file: %s', err) - raise ProfilerIOException - - def _load_timeline_data(self): - """Load timeline data from file.""" - file_path = os.path.join( - self._profiling_dir, - self._output_timeline_data_file_path.format(self._device_id) - ) - file_path = validate_and_normalize_path( - file_path, raise_key='Invalid timeline txt file path.' 
- ) - if not os.path.exists(file_path): - logger.error("Failed to find parsed timeline file.") - raise ProfilerFileNotFoundException('parsed timeline file') - - timeline_list = [] - try: - with open(file_path, 'r') as f_obj: - for line in f_obj: - if not line.startswith('op_name'): - line_list = line.strip('\n').split(',') - timeline_list.append(line_list) - except (IOError, OSError) as err: - logger.error('Error occurred when read timeline intermediate file: %s', err) - raise ProfilerIOException - - return timeline_list - - def _parse_timeline_data(self, timeline, min_cycle_counter): - """Parse timeline data.""" - # factor to convert the time unit from 1ms to 1us for timeline display - factor = 1000 - op_meta = TimelineContainer(timeline) - timeline_dict = {} - timeline_dict['name'] = op_meta.op_name - timeline_dict['ph'] = 'X' - timeline_dict['tid'] = op_meta.stream_id - timeline_dict['ts'] = (op_meta.start_time - min_cycle_counter) * factor - dur = op_meta.duration * factor - timeline_dict['dur'] = dur - if op_meta.pid is None: - timeline_dict['pid'] = int(self._device_id) - # Update total time of operator execution. - self._timeline_summary['total_time'] += dur - else: # AllReduce and AI CPU pid - timeline_dict['pid'] = op_meta.pid - self._timeline_meta.append(timeline_dict) - - @staticmethod - def _update_num_of_streams(timeline, stream_count_dict): - """Update number of streams.""" - stream_id = timeline[1] - if stream_id not in stream_count_dict.keys(): - stream_count_dict[stream_id] = 1 - else: - stream_count_dict[stream_id] += 1 - - def get_min_cycle_counter(self): - """ - Get minimum cycle counter. - - Returns: - float, the minimum value of the cycle counter. - """ - file_path = os.path.join( - self._profiling_dir, - self._min_cycle_counter_file_path.format(self._device_id) - ) - - file_path = validate_and_normalize_path( - file_path, raise_key='Invalid min cycle counter file path.' 
- ) - - if os.path.exists(file_path): - try: - with open(file_path, 'r') as f_obj: - min_cycle_counter = f_obj.read() - min_cycle_counter = float(min_cycle_counter) \ - if not min_cycle_counter == 'inf' else 0 - except (IOError, OSError) as err: - logger.error('Error occurred when read minimum cycle counter: %s', err) - raise ProfilerIOException - else: - min_cycle_counter = 0 - logger.info("No min cycle counter recorded.") - - return min_cycle_counter - - def init_timeline(self, all_reduce_info, framework_info, aicpu_info, min_cycle_counter): - """ - Init timeline metadata, adding all collected info. - - Args: - all_reduce_info (list[list]): The metadata of AllReduce operator. - framework_info (dict): The framework metadata. - aicpu_info (dict): The metadata of AI CPU operator. - min_cycle_counter (float): The minimum cycle counter of the timeline. - """ - if min_cycle_counter == float('inf'): - min_cycle_counter = 0 - - logger.info('Initiating timeline...') - timeline_list = self._load_timeline_data() - self._timeline_summary['op_exe_times'] = len(timeline_list) - - # Add AllReduce info to timeline temp list and sort by start time. - if all_reduce_info: - logger.debug('AllReduce info found. Start adding info into timeline...') - timeline_list.extend(all_reduce_info) - timeline_list.sort(key=lambda x: float(x[2])) - - # Add AI CPU data into timeline temp list and sort by start time. - aicpu_data = aicpu_info.get('info') - if aicpu_data: - timeline_list.extend(aicpu_data) - timeline_list.sort(key=lambda x: float(x[2])) - self._timeline_summary['op_exe_times'] += aicpu_info.get('op_exe_times', 0) - self._timeline_summary['num_of_streams'] += aicpu_info.get('num_of_streams', 0) - self._timeline_summary['num_of_ops'] += aicpu_info.get('num_of_ops', 0) - self._timeline_summary['total_time'] += aicpu_info.get('total_time', 0) - - # Init a dict for counting the num of streams. 
- stream_count_dict = {} - for timeline in timeline_list: - self._parse_timeline_data(timeline, min_cycle_counter) - # Updating the collection of streams. - if len(timeline) == 4: - self._update_num_of_streams(timeline, stream_count_dict) - - # Get framework metadata. - framework_obj_list = framework_info.get('object') - # The length of list is the number of operators. - self._timeline_summary['num_of_ops'] += len(framework_obj_list) - self._add_framework_info(framework_obj_list) - logger.info('Finished adding info into timeline...') - - # Update timeline summary info - self._timeline_summary['num_of_streams'] += len(stream_count_dict.keys()) - - def _add_framework_info(self, framework_obj_list): - """ - Add framework info into timeline metadata. - - Args: - framework_obj_list (list): The framework metadata. - """ - logger.debug('Start adding framework info into timeline...') - # Get the framework info that will be written into timeline. - framework_info_dict = {} - for framework_obj in framework_obj_list: - op_name = framework_obj[0] - op_type = framework_obj[1] - op_full_name = framework_obj[4] - op_info = framework_obj[5] - framework_info_dict[op_full_name] = { - 'name': op_name, - 'args': { - 'type': op_type, - 'fullname': op_full_name - } - } - framework_info_dict[op_full_name]['args'].update(op_info) - - # Insert framework info into timeline. 
for timeline_item in self._timeline_meta: - op_full_name = timeline_item.get('name') - framework_item = framework_info_dict.get(op_full_name) - if framework_item: - timeline_item['name'] = framework_item.get('name') - timeline_item['args'] = framework_item.get('args') - - logger.debug('Finished adding framework info into timeline...') diff --git a/tests/st/func/profiler/test_timeline_analyser.py b/tests/st/func/profiler/test_timeline_analyser.py new file mode 100644 index 00000000..1908861e --- /dev/null +++ b/tests/st/func/profiler/test_timeline_analyser.py @@ -0,0 +1,118 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +""" +Function: + Test profiler to watch the performance of training. +Usage: + pytest tests/st/func/profiler +""" +import os + +import pytest + +from mindinsight.profiler.analyser.analyser_factory import AnalyserFactory +from . 
import PROFILER_DIR +from ....utils.tools import compare_result_with_file + +class TestTimelineAnalyser: + """Test timeline analyser module.""" + + @classmethod + def setup_class(cls): + """Generate parsed files.""" + cls.profiler = PROFILER_DIR + cls.device_id = 0 + cls.ascend_display_filename = 'ascend_timeline_display_{}.json' + cls.gpu_display_filename = 'gpu_timeline_display_{}.json' + cls.ascend_timeline_summary_filename = 'ascend_timeline_summary_{}.json' + cls.gpu_timeline_summary_filename = 'gpu_timeline_summary_{}.json' + + def setup_method(self): + """Create analyser.""" + self._analyser = AnalyserFactory.instance().get_analyser( + 'timeline', self.profiler, self.device_id) + + + @pytest.mark.level0 + @pytest.mark.env_single + @pytest.mark.platform_x86_cpu + @pytest.mark.platform_arm_ascend_training + @pytest.mark.platform_x86_gpu_training + @pytest.mark.platform_x86_ascend_training + def test_get_display_timeline(self): + """Test the function of get timeline data for UI display.""" + gpu_file_path = os.path.join( + self.profiler, + self.gpu_display_filename.format(self.device_id) + ) + ascend_file_path = os.path.join( + self.profiler, + self.ascend_display_filename.format(self.device_id) + ) + + result = self._analyser.get_display_timeline("gpu") + compare_result_with_file(result, gpu_file_path) + + result = self._analyser.get_display_timeline("ascend") + compare_result_with_file(result, ascend_file_path) + + @pytest.mark.level0 + @pytest.mark.env_single + @pytest.mark.platform_x86_cpu + @pytest.mark.platform_arm_ascend_training + @pytest.mark.platform_x86_gpu_training + @pytest.mark.platform_x86_ascend_training + def test_get_timeline_summary(self): + """Test the function of get timeline summary.""" + gpu_file_path = os.path.join( + self.profiler, + self.gpu_timeline_summary_filename.format(self.device_id) + ) + ascend_file_path = os.path.join( + self.profiler, + self.ascend_timeline_summary_filename.format(self.device_id) + ) + + result = 
self._analyser.get_timeline_summary("gpu") + compare_result_with_file(result, gpu_file_path) + + result = self._analyser.get_timeline_summary("ascend") + compare_result_with_file(result, ascend_file_path) + + @pytest.mark.level0 + @pytest.mark.env_single + @pytest.mark.platform_x86_cpu + @pytest.mark.platform_arm_ascend_training + @pytest.mark.platform_x86_gpu_training + @pytest.mark.platform_x86_ascend_training + def test_timeline_summary_file_not_exist(self): + """Test the function of get timeline data for UI display.""" + device_id = 1 + analyser = AnalyserFactory.instance().get_analyser( + 'timeline', self.profiler, device_id) + analyser.get_timeline_summary("gpu") + + @pytest.mark.level0 + @pytest.mark.env_single + @pytest.mark.platform_x86_cpu + @pytest.mark.platform_arm_ascend_training + @pytest.mark.platform_x86_gpu_training + @pytest.mark.platform_x86_ascend_training + def test_timeline_display_file_not_exist(self): + """Test the function of get timeline data for UI display.""" + device_id = 1 + analyser = AnalyserFactory.instance().get_analyser( + 'timeline', self.profiler, device_id) + analyser.get_display_timeline("gpu") diff --git a/tests/utils/resource/profiler/ascend_timeline_display_0.json b/tests/utils/resource/profiler/ascend_timeline_display_0.json new file mode 100644 index 00000000..0041f61c --- /dev/null +++ b/tests/utils/resource/profiler/ascend_timeline_display_0.json @@ -0,0 +1 @@ +[{"name": "InitData", "ph": "X", "tid": "0", "ts": 0.0, "dur": 159.9999964237213, "pid": 9000},{"name": "GetNext", "ph": "X", "tid": "0", "ts": 6042138.9999985695, "dur": 517.9999768733978, "pid": 9000},{"name": "AssignAdd-op142", "ph": "X", "tid": "519", "ts": 6042909.559994936, "dur": 3.1, "pid": 1, "args": {"type": "AssignAdd", "fullname": "Default/AssignAdd-op142", "input_0": {"format": "DefaultFormat", "data_type": "NUMBER_TYPE_UINT8", "shape": "1"}, "input_1": {"format": "DefaultFormat", "data_type": "NUMBER_TYPE_UINT8", "shape": "1"}, "output_0": 
{"format": "DefaultFormat", "data_type": "NUMBER_TYPE_UINT8", "shape": "1"}}}] diff --git a/tests/utils/resource/profiler/ascend_timeline_summary_0.json b/tests/utils/resource/profiler/ascend_timeline_summary_0.json new file mode 100644 index 00000000..256734c2 --- /dev/null +++ b/tests/utils/resource/profiler/ascend_timeline_summary_0.json @@ -0,0 +1 @@ +{"total_time": 23147.013999999992, "num_of_streams": 2, "num_of_ops": 213, "op_exe_times": 112} \ No newline at end of file diff --git a/tests/utils/resource/profiler/gpu_timeline_display_0.json b/tests/utils/resource/profiler/gpu_timeline_display_0.json new file mode 100644 index 00000000..2236a100 --- /dev/null +++ b/tests/utils/resource/profiler/gpu_timeline_display_0.json @@ -0,0 +1 @@ +[{"name": "Default/InitDataSetQueue-op0", "ph": "X", "tid": "Ops", "ts": 1600695893269618.2, "dur": 303.84, "pid": 0},{"name": "void MomentumUpdateVariableKernel(unsigned long, float*, float*, float const*, float const*, float const*, bool)", "ph": "X", "tid": "17", "ts": 1600695897737724.8, "dur": 5.792, "pid": 0, "args": {"type": "cuLaunchKernel", "op_full_name": "Default/optimizer-Momentum/ApplyMomentum-op55", "block_dim": "1024,1,1", "grid_dim": "8,1,1"}}] diff --git a/tests/utils/resource/profiler/gpu_timeline_summary_0.json b/tests/utils/resource/profiler/gpu_timeline_summary_0.json new file mode 100644 index 00000000..ee975dbd --- /dev/null +++ b/tests/utils/resource/profiler/gpu_timeline_summary_0.json @@ -0,0 +1 @@ +{"total_time": 3875.638703999999, "num_of_streams": 1, "num_of_ops": 66, "op_exe_times": 66} \ No newline at end of file