@BLUEPRINT.route("/profile/cluster-peak-memory", methods=["GET"])
def get_cluster_peak_memory():
    """
    Get the peak memory of the devices in a cluster training job.

    The train id is read from the request, resolved to a directory under
    `settings.SUMMARY_BASE_DIR`, validated, and then handed to the
    'cluster_memory' analyser.

    Returns:
        The response built by `jsonify` from the analyser's `get_peak_memory()` result.

    Raises:
        ParamValueError: If the request carries no train id.

    Examples:
        >>> GET http://xxx/v1/mindinsight/profile/cluster-peak-memory
    """
    train_id = get_train_id(request)
    if not train_id:
        raise ParamValueError('No train id.')

    # Build the job directory and keep only the validated, normalized form of it.
    profiler_dir = validate_and_normalize_path(
        os.path.join(settings.SUMMARY_BASE_DIR, train_id), 'cluster_profiler'
    )
    check_train_job_and_profiler_dir(profiler_dir)

    memory_analyser = AnalyserFactory.instance().get_analyser('cluster_memory', profiler_dir)
    return jsonify(memory_analyser.get_peak_memory())
class ClusterMemoryAnalyser(ClusterAnalyser):
    """
    The analyser for analyzing the cluster memory usage.

    For every (host, device, rank) entry it reads that device's memory usage
    summary JSON file and reports the `capacity` and `peak_mem` values found in it.
    """
    # File name template of the per-device summary; formatted with the device id.
    _summary_filename = 'memory_usage_summary_{}.json'

    def __init__(self, cluster_profiler_dir, device_id='0'):
        """
        Initialize the analyser.

        Args:
            cluster_profiler_dir (str): The profiler directory of the cluster job.
            device_id (str): The device id passed through to the base analyser. Default: '0'.
        """
        super().__init__(cluster_profiler_dir, device_id)
        self._cluster_dir = os.path.join(cluster_profiler_dir, 'cluster_profiler')

    def get_peak_memory(self):
        """
        Get peak memory for each device.

        Returns:
            list[dict], one entry per (host, device, rank) relation with the keys
            'host_ip', 'device_id', 'rank_id', 'capacity' and 'peak_mem'. 'capacity'
            and 'peak_mem' are None when the summary file lacks those keys.

        Raises:
            ProfilerIOException: If a memory summary file cannot be read or parsed.
        """
        peak_mem_list = []

        # NOTE(review): `_host_device_rank_relation` is not set in this class;
        # presumably the base class provides it - confirm.
        for host_map_ip, device_id, rank_id in self._host_device_rank_relation:
            # Keep the validator's return value so that the path used below is
            # the validated, normalized one rather than the raw joined string.
            host_dir = validate_and_normalize_path(
                os.path.join(self._cluster_dir, host_map_ip, 'profiler'),
                raise_key='Invalid host directory {}.'.format(host_map_ip)
            )
            file_path = self._get_memory_file_for_each_device(host_dir, device_id)
            file_content = self._get_file_content(file_path)

            peak_mem_list.append({
                'host_ip': host_map_ip,
                'device_id': device_id,
                'rank_id': rank_id,
                'capacity': file_content.get('capacity'),
                'peak_mem': file_content.get('peak_mem')
            })

        return peak_mem_list

    def _get_memory_file_for_each_device(self, path, device_id):
        """
        Get memory file for each device.

        Args:
            path (str): The host profiler directory that holds the summary files.
            device_id (str): The device id used to format the summary file name.

        Returns:
            str, the validated and normalized path of the memory summary file.
        """
        filename = self._summary_filename.format(device_id)
        # Return what the validator hands back instead of discarding it, so the
        # caller opens the same path that was validated.
        return validate_and_normalize_path(
            os.path.join(path, filename), raise_key='Invalid memory usage file path.'
        )

    @staticmethod
    def _get_file_content(file_path):
        """
        Get file content.

        Args:
            file_path (str): The path of the JSON file to load.

        Returns:
            The object decoded from the JSON file.

        Raises:
            ProfilerIOException: If the file cannot be opened, read or decoded as JSON.
        """
        try:
            with open(file_path, 'r') as f_obj:
                file_content = json.load(f_obj)
        except (IOError, OSError, json.JSONDecodeError) as err:
            log.error('Error occurred when read memory file: %s', err)
            # Chain the original error so the root cause stays in the traceback.
            raise ProfilerIOException() from err

        return file_content
Please check the output path.') @@ -106,7 +106,7 @@ class TimelineAnalyser(BaseAnalyser): timeline_summary = json.load(f_obj) except (IOError, OSError, json.JSONDecodeError) as err: logger.error('Error occurred when read timeline summary file: %s', err) - raise ProfilerIOException + raise ProfilerIOException() else: logger.info('No timeline summary file. Please check the output path.')