From: @zhangyunshu Reviewed-by: @yelihua, @ouwenchang Signed-off-by: @yanghaitao1 pull/1307/MERGE
| @@ -674,6 +674,34 @@ def get_cluster_step_trace_info(): | |||||
| return jsonify(step_trace_info) | return jsonify(step_trace_info) | ||||
@BLUEPRINT.route("/profile/cluster-peak-memory", methods=["GET"])
def get_cluster_peak_memory():
    """
    Get the peak memory of the devices in a cluster training job.

    The train id is read from the request, resolved against
    ``settings.SUMMARY_BASE_DIR`` and validated before the
    ``cluster_memory`` analyser is asked for its peak memory data.

    Returns:
        The value of the analyser's ``get_peak_memory()`` passed through ``jsonify``.

    Raises:
        ParamValueError: If the request carries no train id.

    Examples:
        >>>GET http://xxx/v1/mindinsight/profile/cluster-peak-memory
    """
    job_id = get_train_id(request)
    if not job_id:
        raise ParamValueError('No train id.')

    # Join first, then validate/normalize, so that only the checked path is used below.
    profiler_root = validate_and_normalize_path(
        os.path.join(settings.SUMMARY_BASE_DIR, job_id), 'cluster_profiler'
    )
    check_train_job_and_profiler_dir(profiler_root)

    memory_analyser = AnalyserFactory.instance().get_analyser('cluster_memory', profiler_root)
    return jsonify(memory_analyser.get_peak_memory())
| def init_module(app): | def init_module(app): | ||||
| """ | """ | ||||
| Init module entry. | Init module entry. | ||||
| @@ -18,7 +18,7 @@ import os | |||||
| from mindinsight.profiler.analyser.base_analyser import BaseAnalyser | from mindinsight.profiler.analyser.base_analyser import BaseAnalyser | ||||
| from mindinsight.profiler.common.exceptions.exceptions import ProfilerFileNotFoundException, \ | from mindinsight.profiler.common.exceptions.exceptions import ProfilerFileNotFoundException, \ | ||||
| ProfilerDirNotFoundException | |||||
| ProfilerDirNotFoundException, ProfilerIOException | |||||
| from mindinsight.profiler.common.log import logger as log | from mindinsight.profiler.common.log import logger as log | ||||
| from mindinsight.profiler.common.validator.validate_path import validate_and_normalize_path | from mindinsight.profiler.common.validator.validate_path import validate_and_normalize_path | ||||
| @@ -251,3 +251,57 @@ class ClusterStepTraceAnalyser(ClusterAnalyser): | |||||
| 'step_trace': result, | 'step_trace': result, | ||||
| 'size': self._cluster_step_trace_info_size | 'size': self._cluster_step_trace_info_size | ||||
| } | } | ||||
class ClusterMemoryAnalyser(ClusterAnalyser):
    """
    The analyser for analyzing the cluster memory usage.

    For every (host, device, rank) entry it reads the memory usage summary
    file ``memory_usage_summary_<device_id>.json`` located under
    ``<cluster_profiler_dir>/cluster_profiler/<host>/profiler`` and reports
    the ``capacity`` and ``peak_mem`` values stored in it.
    """
    _summary_filename = 'memory_usage_summary_{}.json'

    def __init__(self, cluster_profiler_dir, device_id='0'):
        """
        Initialize the analyser.

        Args:
            cluster_profiler_dir (str): Directory of the train job; the
                ``cluster_profiler`` sub-directory is looked up beneath it.
            device_id (str): Device id forwarded to the base analyser. Default: '0'.
        """
        super().__init__(cluster_profiler_dir, device_id)
        self._cluster_dir = os.path.join(cluster_profiler_dir, 'cluster_profiler')

    def get_peak_memory(self):
        """
        Get peak memory for each device.

        Returns:
            list[dict], one entry per item of ``self._host_device_rank_relation``
            with the keys ``host_ip``, ``device_id``, ``rank_id``, ``capacity``
            and ``peak_mem``. ``capacity``/``peak_mem`` are None when the
            summary file does not contain them.

        Raises:
            ProfilerIOException: If a memory summary file cannot be read or parsed.
        """
        peak_mem_list = []

        # NOTE(review): _host_device_rank_relation is presumably populated by
        # the ClusterAnalyser base class - confirm against the parent.
        for host_map_ip, device_id, rank_id in self._host_device_rank_relation:
            # Keep the normalized path returned by the validator so that the
            # path which was checked is the same path that is read afterwards.
            host_dir = validate_and_normalize_path(
                os.path.join(self._cluster_dir, host_map_ip, 'profiler'),
                raise_key='Invalid host directory {}.'.format(host_map_ip)
            )
            file_path = self._get_memory_file_for_each_device(host_dir, device_id)
            file_content = self._get_file_content(file_path)

            peak_mem_list.append({
                'host_ip': host_map_ip,
                'device_id': device_id,
                'rank_id': rank_id,
                'capacity': file_content.get('capacity'),
                'peak_mem': file_content.get('peak_mem')
            })

        return peak_mem_list

    def _get_memory_file_for_each_device(self, path, device_id):
        """
        Get memory file for each device.

        Args:
            path (str): The profiler directory of one host.
            device_id (str): The device id used to build the summary file name.

        Returns:
            str, the validated and normalized path of the memory summary file.
        """
        filename = self._summary_filename.format(device_id)
        # Return the validator's result rather than the raw joined path.
        return validate_and_normalize_path(
            os.path.join(path, filename), raise_key='Invalid memory usage file path.'
        )

    @staticmethod
    def _get_file_content(file_path):
        """
        Get file content.

        Args:
            file_path (str): Path of the JSON memory summary file.

        Returns:
            dict, the decoded JSON object.

        Raises:
            ProfilerIOException: If the file cannot be opened, decoded or
                parsed, or if its top-level JSON value is not an object.
        """
        try:
            # JSON is UTF-8 by specification; do not depend on the locale encoding.
            with open(file_path, 'r', encoding='utf-8') as f_obj:
                file_content = json.load(f_obj)
        except (IOError, OSError, UnicodeDecodeError, json.JSONDecodeError) as err:
            log.error('Error occurred when read memory file: %s', err)
            # Chain the original error so the root cause stays in the traceback.
            raise ProfilerIOException() from err

        # Callers use dict.get() on the result; reject other JSON top-level types
        # with the profiler IO error instead of failing later with AttributeError.
        if not isinstance(file_content, dict):
            log.error('Error occurred when read memory file: unexpected content type %s',
                      type(file_content).__name__)
            raise ProfilerIOException()

        return file_content
| @@ -126,7 +126,7 @@ class MemoryUsageAnalyser(BaseAnalyser): | |||||
| file_content = json.load(f_obj) | file_content = json.load(f_obj) | ||||
| except (IOError, OSError, json.JSONDecodeError) as err: | except (IOError, OSError, json.JSONDecodeError) as err: | ||||
| logger.error('Error occurred when read memory file: %s', err) | logger.error('Error occurred when read memory file: %s', err) | ||||
| raise ProfilerIOException | |||||
| raise ProfilerIOException() | |||||
| return file_content | return file_content | ||||
| @@ -74,7 +74,7 @@ class TimelineAnalyser(BaseAnalyser): | |||||
| timeline = list(filter(lambda x: x, timeline)) | timeline = list(filter(lambda x: x, timeline)) | ||||
| except (IOError, OSError, json.JSONDecodeError) as err: | except (IOError, OSError, json.JSONDecodeError) as err: | ||||
| logger.error('Error occurred when read timeline display file: %s', err) | logger.error('Error occurred when read timeline display file: %s', err) | ||||
| raise ProfilerIOException | |||||
| raise ProfilerIOException() | |||||
| else: | else: | ||||
| logger.info('No timeline file. Please check the output path.') | logger.info('No timeline file. Please check the output path.') | ||||
| @@ -106,7 +106,7 @@ class TimelineAnalyser(BaseAnalyser): | |||||
| timeline_summary = json.load(f_obj) | timeline_summary = json.load(f_obj) | ||||
| except (IOError, OSError, json.JSONDecodeError) as err: | except (IOError, OSError, json.JSONDecodeError) as err: | ||||
| logger.error('Error occurred when read timeline summary file: %s', err) | logger.error('Error occurred when read timeline summary file: %s', err) | ||||
| raise ProfilerIOException | |||||
| raise ProfilerIOException() | |||||
| else: | else: | ||||
| logger.info('No timeline summary file. Please check the output path.') | logger.info('No timeline summary file. Please check the output path.') | ||||