Browse Source

add st of profiler timeline

tags/v1.1.0
gzhcv 5 years ago
parent
commit
a525f61f68
6 changed files with 123 additions and 280 deletions
  1. +1
    -280
      mindinsight/profiler/analyser/timeline_analyser.py
  2. +118
    -0
      tests/st/func/profiler/test_timeline_analyser.py
  3. +1
    -0
      tests/utils/resource/profiler/ascend_timeline_display_0.json
  4. +1
    -0
      tests/utils/resource/profiler/ascend_timeline_summary_0.json
  5. +1
    -0
      tests/utils/resource/profiler/gpu_timeline_display_0.json
  6. +1
    -0
      tests/utils/resource/profiler/gpu_timeline_summary_0.json

+ 1
- 280
mindinsight/profiler/analyser/timeline_analyser.py View File

@@ -17,75 +17,20 @@ import json
import os

from mindinsight.profiler.analyser.base_analyser import BaseAnalyser
from mindinsight.profiler.common.exceptions.exceptions import ProfilerFileNotFoundException, \
ProfilerIOException
from mindinsight.profiler.common.exceptions.exceptions import ProfilerIOException
from mindinsight.profiler.common.log import logger
from mindinsight.profiler.common.validator.validate_path import validate_and_normalize_path
from mindinsight.utils.exceptions import ParamValueError

SIZE_LIMIT = 20 * 1024 * 1024 # 20MB


class TimelineContainer:
    """
    A container of operator computation metadata.

    Args:
        split_list (list): The split list of metadata in op_compute output file.
            Layout is [op_name, stream_id, start_time, duration] with an
            optional fifth pid element.
    """
    def __init__(self, split_list):
        self._op_name = split_list[0]
        self._stream_id = int(split_list[1])
        self._start_time = float(split_list[2])
        self._duration = float(split_list[3])
        # pid is only present for records carrying exactly five fields.
        self._pid = int(split_list[4]) if len(split_list) == 5 else None

    @property
    def op_name(self):
        """Get the name of the operator."""
        return self._op_name

    @property
    def stream_id(self):
        """Get the stream id of the operator."""
        return self._stream_id

    @property
    def start_time(self):
        """Get the execution start time of the operator."""
        return self._start_time

    @property
    def duration(self):
        """Get the duration of the operator execution."""
        return self._duration

    @property
    def pid(self):
        """Get the pid of the operator execution (None when not recorded)."""
        return self._pid


class TimelineAnalyser(BaseAnalyser):
"""
Analyse timeline data from file.
"""
__col_names__ = ['op_name', 'stream_id', 'start_time', 'duration']
_output_timeline_data_file_path = 'output_timeline_data_{}.txt'
_min_cycle_counter_file_path = 'min_cycle_counter_{}.txt'
_ascend_display_filename = 'ascend_timeline_display_{}.json'
_gpu_display_filename = 'gpu_timeline_display_{}.json'
_ascend_timeline_summary_filename = 'ascend_timeline_summary_{}.json'
_gpu_timeline_summary_filename = 'gpu_timeline_summary_{}.json'
_timeline_meta = []
_timeline_summary = {
'total_time': 0,
'num_of_streams': 0,
'num_of_ops': 0,
'op_exe_times': 0
}

def _load(self):
"""Load data according to the parsed profiling files."""
@@ -161,227 +106,3 @@ class TimelineAnalyser(BaseAnalyser):
logger.info('No timeline summary file. Please check the output path.')

return timeline_summary

def write_timeline(self):
    """Write the collected timeline metadata to the display json file."""
    # Delegate to the size-limited writer so the output stays under SIZE_LIMIT.
    logger.info('Writing timeline file...')
    self.write_timeline_to_json_by_limitation()
    logger.info('Finished file writing!')

def write_timeline_to_json_by_limitation(self):
    """
    Write timeline metadata to the display json file, truncating at SIZE_LIMIT.

    Items from self._timeline_meta are dumped one at a time; once the file
    grows past SIZE_LIMIT (20MB) the remaining items are dropped so the UI
    payload stays bounded. The output is always a closed json array.

    Raises:
        ProfilerIOException: If the display file cannot be written.
    """
    display_filename = self._display_filename.format(self._device_id)
    display_file_path = os.path.join(
        self._profiling_dir,
        display_filename
    )
    display_file_path = validate_and_normalize_path(
        display_file_path, raise_key='Invalid timeline display json path.'
    )

    length = len(self._timeline_meta)
    try:
        with open(display_file_path, 'w') as json_file:
            json_file.write('[')
            for index, item in enumerate(self._timeline_meta):
                json.dump(item, json_file)
                # Flush before measuring: os.path.getsize reports the on-disk
                # size, so buffered-but-unwritten data would make the limit
                # check lag behind what has actually been produced.
                json_file.flush()
                file_size = os.path.getsize(display_file_path)
                if file_size > SIZE_LIMIT:
                    break
                if index == length - 1:
                    break
                json_file.write(',')
            json_file.write(']')
    except (IOError, OSError) as err:
        logger.error('Error occurred when write timeline display file: %s', err)
        raise ProfilerIOException

def write_timeline_summary(self):
    """Write timeline summary to json."""
    summary_filename = self._timeline_summary_filename.format(self._device_id)
    summary_path = validate_and_normalize_path(
        os.path.join(self._profiling_dir, summary_filename),
        raise_key='Invalid timeline summary path.'
    )

    try:
        # Summary is a small fixed-key dict, so a single dump is sufficient.
        with open(summary_path, 'w') as json_file:
            json.dump(self._timeline_summary, json_file)
    except (IOError, OSError) as err:
        logger.error('Error occurred when write timeline summary file: %s', err)
        raise ProfilerIOException

def _load_timeline_data(self):
    """Load timeline data from file."""
    file_path = validate_and_normalize_path(
        os.path.join(
            self._profiling_dir,
            self._output_timeline_data_file_path.format(self._device_id)
        ),
        raise_key='Invalid timeline txt file path.'
    )
    if not os.path.exists(file_path):
        logger.error("Failed to find parsed timeline file.")
        # NOTE(review): this commit's diff appears to remove
        # ProfilerFileNotFoundException from the module imports while this
        # raise still references it — confirm the name is still in scope.
        raise ProfilerFileNotFoundException('parsed timeline file')

    try:
        with open(file_path, 'r') as f_obj:
            # Skip the header line (starts with 'op_name'); every remaining
            # line is one comma-separated timeline record.
            timeline_list = [
                line.strip('\n').split(',')
                for line in f_obj
                if not line.startswith('op_name')
            ]
    except (IOError, OSError) as err:
        logger.error('Error occurred when read timeline intermediate file: %s', err)
        raise ProfilerIOException

    return timeline_list

def _parse_timeline_data(self, timeline, min_cycle_counter):
    """Convert one timeline record to chrome-trace format and collect it."""
    # factor to convert the time unit from 1ms to 1us for timeline display
    factor = 1000
    op_meta = TimelineContainer(timeline)
    duration = op_meta.duration * factor
    timeline_dict = {
        'name': op_meta.op_name,
        'ph': 'X',
        'tid': op_meta.stream_id,
        'ts': (op_meta.start_time - min_cycle_counter) * factor,
        'dur': duration,
    }
    if op_meta.pid is None:
        timeline_dict['pid'] = int(self._device_id)
        # Update total time of operator execution.
        self._timeline_summary['total_time'] += duration
    else:  # AllReduce and AI CPU pid
        timeline_dict['pid'] = op_meta.pid
    self._timeline_meta.append(timeline_dict)

@staticmethod
def _update_num_of_streams(timeline, stream_count_dict):
"""Update number of streams."""
stream_id = timeline[1]
if stream_id not in stream_count_dict.keys():
stream_count_dict[stream_id] = 1
else:
stream_count_dict[stream_id] += 1

def get_min_cycle_counter(self):
    """
    Get minimum cycle counter.

    Returns:
        float, the minimum value of the cycle counter; 0 when the file is
        absent or records 'inf'.

    Raises:
        ProfilerIOException: If the recorded counter file cannot be read.
    """
    file_path = os.path.join(
        self._profiling_dir,
        self._min_cycle_counter_file_path.format(self._device_id)
    )

    file_path = validate_and_normalize_path(
        file_path, raise_key='Invalid min cycle counter file path.'
    )

    if os.path.exists(file_path):
        try:
            with open(file_path, 'r') as f_obj:
                # Strip surrounding whitespace first: with a trailing newline
                # the 'inf' comparison fails and float('inf\n') leaks through
                # as inf instead of the intended 0.
                min_cycle_counter = f_obj.read().strip()
                min_cycle_counter = float(min_cycle_counter) \
                    if not min_cycle_counter == 'inf' else 0
        except (IOError, OSError) as err:
            logger.error('Error occurred when read minimum cycle counter: %s', err)
            raise ProfilerIOException
    else:
        min_cycle_counter = 0
        logger.info("No min cycle counter recorded.")

    return min_cycle_counter

def init_timeline(self, all_reduce_info, framework_info, aicpu_info, min_cycle_counter):
    """
    Init timeline metadata, adding all collected info.

    Args:
        all_reduce_info (list[list]): The metadata of AllReduce operator.
        framework_info (dict): The framework metadata.
        aicpu_info (dict): The metadata of AI CPU operator.
        min_cycle_counter (float): The minimum cycle counter of the timeline.
    """
    if min_cycle_counter == float('inf'):
        min_cycle_counter = 0

    logger.info('Initiating timeline...')
    timeline_list = self._load_timeline_data()
    self._timeline_summary['op_exe_times'] = len(timeline_list)

    # Add AllReduce info to timeline temp list and sort by start time.
    if all_reduce_info:
        logger.debug('AllReduce info found. Start adding info into timeline...')
        timeline_list.extend(all_reduce_info)
        timeline_list.sort(key=lambda x: float(x[2]))

    # Add AI CPU data into timeline temp list and sort by start time.
    aicpu_data = aicpu_info.get('info')
    if aicpu_data:
        timeline_list.extend(aicpu_data)
        timeline_list.sort(key=lambda x: float(x[2]))
        self._timeline_summary['op_exe_times'] += aicpu_info.get('op_exe_times', 0)
        self._timeline_summary['num_of_streams'] += aicpu_info.get('num_of_streams', 0)
        self._timeline_summary['num_of_ops'] += aicpu_info.get('num_of_ops', 0)
        self._timeline_summary['total_time'] += aicpu_info.get('total_time', 0)

    # Init a dict for counting the num of streams.
    stream_count_dict = {}
    for timeline in timeline_list:
        self._parse_timeline_data(timeline, min_cycle_counter)
        # Updating the collection of streams: records without an explicit
        # pid field (4 columns) belong to device streams.
        if len(timeline) == 4:
            self._update_num_of_streams(timeline, stream_count_dict)

    # Get framework metadata. Guard against a missing 'object' key, which
    # previously raised TypeError on len(None).
    framework_obj_list = framework_info.get('object')
    if framework_obj_list:
        # The length of list is the number of operators.
        self._timeline_summary['num_of_ops'] += len(framework_obj_list)
        self._add_framework_info(framework_obj_list)
    logger.info('Finished adding info into timeline...')

    # Update timeline summary info
    self._timeline_summary['num_of_streams'] += len(stream_count_dict.keys())

def _add_framework_info(self, framework_obj_list):
    """
    Add framework info into timeline metadata.

    Args:
        framework_obj_list (list): The framework metadata.
    """
    logger.debug('Start adding framework info into timeline...')
    # Build a lookup from op full name to the display name and args that
    # should be written into the timeline.
    framework_info_dict = {}
    for framework_obj in framework_obj_list:
        op_name, op_type = framework_obj[0], framework_obj[1]
        op_full_name, op_info = framework_obj[4], framework_obj[5]
        args = {
            'type': op_type,
            'fullname': op_full_name
        }
        args.update(op_info)
        framework_info_dict[op_full_name] = {'name': op_name, 'args': args}

    # Insert framework info into timeline.
    for timeline_item in self._timeline_meta:
        framework_item = framework_info_dict.get(timeline_item.get('name'))
        if framework_item:
            timeline_item['name'] = framework_item.get('name')
            timeline_item['args'] = framework_item.get('args')

    logger.debug('Finished adding framework info into timeline...')

+ 118
- 0
tests/st/func/profiler/test_timeline_analyser.py View File

@@ -0,0 +1,118 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""
Function:
Test profiler to watch the performance of training.
Usage:
pytest tests/st/func/profiler
"""
import os

import pytest

from mindinsight.profiler.analyser.analyser_factory import AnalyserFactory
from . import PROFILER_DIR
from ....utils.tools import compare_result_with_file

class TestTimelineAnalyser:
    """Test timeline analyser module."""

    @classmethod
    def setup_class(cls):
        """Generate parsed files."""
        cls.profiler = PROFILER_DIR
        cls.device_id = 0
        cls.ascend_display_filename = 'ascend_timeline_display_{}.json'
        cls.gpu_display_filename = 'gpu_timeline_display_{}.json'
        cls.ascend_timeline_summary_filename = 'ascend_timeline_summary_{}.json'
        cls.gpu_timeline_summary_filename = 'gpu_timeline_summary_{}.json'

    def setup_method(self):
        """Create analyser."""
        self._analyser = AnalyserFactory.instance().get_analyser(
            'timeline', self.profiler, self.device_id)


    @pytest.mark.level0
    @pytest.mark.env_single
    @pytest.mark.platform_x86_cpu
    @pytest.mark.platform_arm_ascend_training
    @pytest.mark.platform_x86_gpu_training
    @pytest.mark.platform_x86_ascend_training
    def test_get_display_timeline(self):
        """Test the function of get timeline data for UI display."""
        gpu_file_path = os.path.join(
            self.profiler,
            self.gpu_display_filename.format(self.device_id)
        )
        ascend_file_path = os.path.join(
            self.profiler,
            self.ascend_display_filename.format(self.device_id)
        )

        result = self._analyser.get_display_timeline("gpu")
        compare_result_with_file(result, gpu_file_path)

        result = self._analyser.get_display_timeline("ascend")
        compare_result_with_file(result, ascend_file_path)

    @pytest.mark.level0
    @pytest.mark.env_single
    @pytest.mark.platform_x86_cpu
    @pytest.mark.platform_arm_ascend_training
    @pytest.mark.platform_x86_gpu_training
    @pytest.mark.platform_x86_ascend_training
    def test_get_timeline_summary(self):
        """Test the function of get timeline summary for UI display."""
        gpu_file_path = os.path.join(
            self.profiler,
            self.gpu_timeline_summary_filename.format(self.device_id)
        )
        ascend_file_path = os.path.join(
            self.profiler,
            self.ascend_timeline_summary_filename.format(self.device_id)
        )

        result = self._analyser.get_timeline_summary("gpu")
        compare_result_with_file(result, gpu_file_path)

        result = self._analyser.get_timeline_summary("ascend")
        compare_result_with_file(result, ascend_file_path)

    @pytest.mark.level0
    @pytest.mark.env_single
    @pytest.mark.platform_x86_cpu
    @pytest.mark.platform_arm_ascend_training
    @pytest.mark.platform_x86_gpu_training
    @pytest.mark.platform_x86_ascend_training
    def test_timeline_summary_file_not_exist(self):
        """Test getting timeline summary when the summary file is absent."""
        # device_id 1 has no generated files in the test resources.
        device_id = 1
        analyser = AnalyserFactory.instance().get_analyser(
            'timeline', self.profiler, device_id)
        analyser.get_timeline_summary("gpu")

    @pytest.mark.level0
    @pytest.mark.env_single
    @pytest.mark.platform_x86_cpu
    @pytest.mark.platform_arm_ascend_training
    @pytest.mark.platform_x86_gpu_training
    @pytest.mark.platform_x86_ascend_training
    def test_timeline_display_file_not_exist(self):
        """Test getting display timeline when the display file is absent."""
        # device_id 1 has no generated files in the test resources.
        device_id = 1
        analyser = AnalyserFactory.instance().get_analyser(
            'timeline', self.profiler, device_id)
        analyser.get_display_timeline("gpu")

+ 1
- 0
tests/utils/resource/profiler/ascend_timeline_display_0.json View File

@@ -0,0 +1 @@
[{"name": "InitData", "ph": "X", "tid": "0", "ts": 0.0, "dur": 159.9999964237213, "pid": 9000},{"name": "GetNext", "ph": "X", "tid": "0", "ts": 6042138.9999985695, "dur": 517.9999768733978, "pid": 9000},{"name": "AssignAdd-op142", "ph": "X", "tid": "519", "ts": 6042909.559994936, "dur": 3.1, "pid": 1, "args": {"type": "AssignAdd", "fullname": "Default/AssignAdd-op142", "input_0": {"format": "DefaultFormat", "data_type": "NUMBER_TYPE_UINT8", "shape": "1"}, "input_1": {"format": "DefaultFormat", "data_type": "NUMBER_TYPE_UINT8", "shape": "1"}, "output_0": {"format": "DefaultFormat", "data_type": "NUMBER_TYPE_UINT8", "shape": "1"}}}]

+ 1
- 0
tests/utils/resource/profiler/ascend_timeline_summary_0.json View File

@@ -0,0 +1 @@
{"total_time": 23147.013999999992, "num_of_streams": 2, "num_of_ops": 213, "op_exe_times": 112}

+ 1
- 0
tests/utils/resource/profiler/gpu_timeline_display_0.json View File

@@ -0,0 +1 @@
[{"name": "Default/InitDataSetQueue-op0", "ph": "X", "tid": "Ops", "ts": 1600695893269618.2, "dur": 303.84, "pid": 0},{"name": "void MomentumUpdateVariableKernel<float, float, float>(unsigned long, float*, float*, float const*, float const*, float const*, bool)", "ph": "X", "tid": "17", "ts": 1600695897737724.8, "dur": 5.792, "pid": 0, "args": {"type": "cuLaunchKernel", "op_full_name": "Default/optimizer-Momentum/ApplyMomentum-op55", "block_dim": "1024,1,1", "grid_dim": "8,1,1"}}]

+ 1
- 0
tests/utils/resource/profiler/gpu_timeline_summary_0.json View File

@@ -0,0 +1 @@
{"total_time": 3875.638703999999, "num_of_streams": 1, "num_of_ops": 66, "op_exe_times": 66}

Loading…
Cancel
Save