
profiler: add RESTful API for cluster memory

pull/1307/head
zhangyunshu 4 years ago
commit 75cc7ad8ec
4 changed files with 86 additions and 4 deletions
  1. +28 -0  mindinsight/backend/profiler/profile_api.py
  2. +55 -1  mindinsight/profiler/analyser/cluster_analyser.py
  3. +1  -1  mindinsight/profiler/analyser/memory_usage_analyser.py
  4. +2  -2  mindinsight/profiler/analyser/timeline_analyser.py

+28 -0  mindinsight/backend/profiler/profile_api.py

@@ -673,6 +673,34 @@ def get_cluster_step_trace_info():
    return jsonify(step_trace_info)


@BLUEPRINT.route("/profile/cluster-peak-memory", methods=["GET"])
def get_cluster_peak_memory():
    """
    Get cluster peak memory.

    Returns:
        str, the cluster peak memory.

    Raises:
        ParamValueError: If the cluster profiler dir is invalid.

    Examples:
        >>> GET http://xxx/v1/mindinsight/profile/cluster-peak-memory
    """
    train_id = get_train_id(request)
    if not train_id:
        raise ParamValueError('No train id.')
    cluster_profiler_dir = os.path.join(settings.SUMMARY_BASE_DIR, train_id)
    cluster_profiler_dir = validate_and_normalize_path(cluster_profiler_dir, 'cluster_profiler')
    check_train_job_and_profiler_dir(cluster_profiler_dir)

    analyser = AnalyserFactory.instance().get_analyser(
        'cluster_memory', cluster_profiler_dir
    )
    peak_mem = analyser.get_peak_memory()
    return jsonify(peak_mem)


def init_module(app):
"""
Init module entry.
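For context, a minimal sketch of querying the new endpoint from Python. The host, port and train_id value are placeholders, and the 'train_id' query parameter name is an assumption based on the get_train_id(request) call above; the response layout follows the peak_mem_list built by ClusterMemoryAnalyser below.

# Sketch only: host/port and train_id are placeholders; the 'train_id'
# query parameter name is assumed from get_train_id(request) above.
import requests

URL = 'http://127.0.0.1:8080/v1/mindinsight/profile/cluster-peak-memory'
response = requests.get(URL, params={'train_id': './cluster_job'}, timeout=10)
response.raise_for_status()

# Each entry describes one device: host_ip, device_id, rank_id, capacity, peak_mem.
for entry in response.json():
    print(entry['host_ip'], entry['device_id'], entry['rank_id'],
          entry['capacity'], entry['peak_mem'])

If train_id is omitted from the request, the handler raises ParamValueError('No train id.') as shown in the route above.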


+55 -1  mindinsight/profiler/analyser/cluster_analyser.py

@@ -18,7 +18,7 @@ import os

from mindinsight.profiler.analyser.base_analyser import BaseAnalyser
from mindinsight.profiler.common.exceptions.exceptions import ProfilerFileNotFoundException, \
    ProfilerDirNotFoundException
    ProfilerDirNotFoundException, ProfilerIOException
from mindinsight.profiler.common.log import logger as log
from mindinsight.profiler.common.validator.validate_path import validate_and_normalize_path

@@ -251,3 +251,57 @@ class ClusterStepTraceAnalyser(ClusterAnalyser):
            'step_trace': result,
            'size': self._cluster_step_trace_info_size
        }


class ClusterMemoryAnalyser(ClusterAnalyser):
    """The analyser for analyzing the cluster memory usage."""
    _summary_filename = 'memory_usage_summary_{}.json'

    def __init__(self, cluster_profiler_dir, device_id='0'):
        super().__init__(cluster_profiler_dir, device_id)
        self._cluster_dir = os.path.join(cluster_profiler_dir, 'cluster_profiler')

    def get_peak_memory(self):
        """Get peak memory for each device."""
        peak_mem_list = []

        for host_map_ip, device_id, rank_id in self._host_device_rank_relation:
            host_dir = os.path.join(self._cluster_dir, host_map_ip, 'profiler')
            validate_and_normalize_path(host_dir, raise_key='Invalid host directory {}.'.format(host_map_ip))
            file_path = self._get_memory_file_for_each_device(host_dir, device_id)
            file_content = self._get_file_content(file_path)
            capacity = file_content.get('capacity')
            peak_mem = file_content.get('peak_mem')

            mem_dict = {
                'host_ip': host_map_ip,
                'device_id': device_id,
                'rank_id': rank_id,
                'capacity': capacity,
                'peak_mem': peak_mem
            }
            peak_mem_list.append(mem_dict)

        return peak_mem_list

    def _get_memory_file_for_each_device(self, path, device_id):
        """Get memory file for each device."""
        filename = self._summary_filename.format(device_id)
        file_path = os.path.join(path, filename)
        validate_and_normalize_path(
            file_path, raise_key='Invalid memory usage file path.'
        )

        return file_path

    @staticmethod
    def _get_file_content(file_path):
        """Get file content."""
        try:
            with open(file_path, 'r') as f_obj:
                file_content = json.load(f_obj)
        except (IOError, OSError, json.JSONDecodeError) as err:
            log.error('Error occurred when read memory file: %s', err)
            raise ProfilerIOException()

        return file_content
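A minimal sketch of driving the analyser directly, mirroring the factory call used in profile_api.py; the cluster profiler directory is a placeholder, and the import path for AnalyserFactory is an assumption rather than something shown in this diff.

# Sketch only: the directory below is a placeholder, and the AnalyserFactory
# import path is an assumption (it is not part of this diff).
from mindinsight.profiler.analyser.analyser_factory import AnalyserFactory

cluster_profiler_dir = '/path/to/summary/cluster_job'
analyser = AnalyserFactory.instance().get_analyser('cluster_memory', cluster_profiler_dir)

# get_peak_memory() returns one dict per device with host_ip, device_id,
# rank_id, capacity and peak_mem, exactly as assembled above.
for entry in analyser.get_peak_memory():
    print('rank {rank_id} ({host_ip}, device {device_id}): '
          'peak {peak_mem} of {capacity}'.format(**entry))

The analyser expects the per-host files at <cluster_profiler_dir>/cluster_profiler/<host_ip>/profiler/memory_usage_summary_<device_id>.json, which is the path assembled by _get_memory_file_for_each_device above.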

+1 -1  mindinsight/profiler/analyser/memory_usage_analyser.py

@@ -126,7 +126,7 @@ class MemoryUsageAnalyser(BaseAnalyser):
                file_content = json.load(f_obj)
        except (IOError, OSError, json.JSONDecodeError) as err:
            logger.error('Error occurred when read memory file: %s', err)
            raise ProfilerIOException
            raise ProfilerIOException()

        return file_content



+2 -2  mindinsight/profiler/analyser/timeline_analyser.py

@@ -69,7 +69,7 @@ class TimelineAnalyser(BaseAnalyser):
                    timeline = json.load(f_obj)
            except (IOError, OSError, json.JSONDecodeError) as err:
                logger.error('Error occurred when read timeline display file: %s', err)
                raise ProfilerIOException
                raise ProfilerIOException()
        else:
            logger.info('No timeline file. Please check the output path.')

@@ -101,7 +101,7 @@ class TimelineAnalyser(BaseAnalyser):
                    timeline_summary = json.load(f_obj)
            except (IOError, OSError, json.JSONDecodeError) as err:
                logger.error('Error occurred when read timeline summary file: %s', err)
                raise ProfilerIOException
                raise ProfilerIOException()
        else:
            logger.info('No timeline summary file. Please check the output path.')
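The hunks in memory_usage_analyser.py and timeline_analyser.py only switch bare raise ProfilerIOException statements to raise ProfilerIOException(). In Python the two forms are equivalent when the exception takes no arguments, since raising a class implicitly instantiates it, so this reads as a consistency cleanup; the snippet below, using a stand-in exception class, illustrates the equivalence.

# Stand-in class to illustrate the change; both raise statements produce an
# instance of the same exception type when no constructor arguments are used.
class DemoIOError(Exception):
    pass

try:
    raise DemoIOError          # bare class: Python instantiates it implicitly
except DemoIOError as err:
    print('class form ->', type(err).__name__)

try:
    raise DemoIOError()        # explicit call: same behaviour, uniform style
except DemoIOError as err:
    print('instance form ->', type(err).__name__)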


