Browse Source

!427 Hardware resource monitor API for getting Ascend, CPU, memory metrics

Merge pull request !427 from LiHongzhang/hw_board
tags/v0.6.0-beta
mindspore-ci-bot Gitee 5 years ago
parent
commit
8e0d7de8bd
11 changed files with 525 additions and 0 deletions
  1. +1
    -0
      mindinsight/backend/application.py
  2. +2
    -0
      mindinsight/backend/datavisual/__init__.py
  3. +39
    -0
      mindinsight/backend/datavisual/sysmetric_api.py
  4. +42
    -0
      mindinsight/sysmetric/collector/__init__.py
  5. +37
    -0
      mindinsight/sysmetric/collector/_collect_cpu.py
  6. +34
    -0
      mindinsight/sysmetric/collector/_collect_mem.py
  7. +281
    -0
      mindinsight/sysmetric/collector/_collect_npu.py
  8. +14
    -0
      mindinsight/sysmetric/common/__init__.py
  9. +18
    -0
      mindinsight/sysmetric/common/log.py
  10. +15
    -0
      tests/ut/sysmetric/__init__.py
  11. +42
    -0
      tests/ut/sysmetric/metrics_collector.py

+ 1
- 0
mindinsight/backend/application.py View File

@@ -111,6 +111,7 @@ def create_app():
static_folder_path = os.path.realpath(os.path.join(os.path.dirname(__file__), os.pardir, 'ui', 'dist', 'static'))

app = Flask(__name__, static_url_path=static_url_path, static_folder=static_folder_path)
app.config['JSON_SORT_KEYS'] = False

if settings.ENABLE_CORS:
CORS(app, supports_credentials=True)


+ 2
- 0
mindinsight/backend/datavisual/__init__.py View File

@@ -17,6 +17,7 @@
from mindinsight.backend.datavisual.static_resource_api import init_module as static_init_module
from mindinsight.backend.datavisual.task_manager_api import init_module as task_init_module
from mindinsight.backend.datavisual.train_visual_api import init_module as train_init_module
from mindinsight.backend.datavisual.sysmetric_api import init_module as sysmetric_init_module


def init_module(app):
@@ -30,3 +31,4 @@ def init_module(app):
static_init_module(app)
task_init_module(app)
train_init_module(app)
sysmetric_init_module(app)

+ 39
- 0
mindinsight/backend/datavisual/sysmetric_api.py View File

@@ -0,0 +1,39 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""System metrics API."""

from flask import Blueprint, jsonify
from mindinsight.conf import settings
from mindinsight.sysmetric.collector import get_metrics

# Blueprint holding the system metrics routes; its URL prefix is
# settings.URL_PATH_PREFIX + settings.API_PREFIX.
BLUEPRINT = Blueprint("sysmetric", __name__, url_prefix=settings.URL_PATH_PREFIX + settings.API_PREFIX)


@BLUEPRINT.route("/sysmetric/current", methods=["GET"])
def query_sysmetric():
    """
    Handle GET requests for the current system metrics.

    Returns:
        Response, the result of `get_metrics()` passed through `jsonify`.
    """
    metrics = get_metrics()
    return jsonify(metrics)


def init_module(app):
    """
    Register the system metrics blueprint on the application.

    Args:
        app: The application object; its `register_blueprint` method is called with `BLUEPRINT`.

    """
    blueprint = BLUEPRINT
    app.register_blueprint(blueprint)

+ 42
- 0
mindinsight/sysmetric/collector/__init__.py View File

@@ -0,0 +1,42 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""The metrics collector."""

from ._collect_cpu import collect_cpu
from ._collect_mem import collect_mem
from ._collect_npu import collect_npu

# Names exported by `from mindinsight.sysmetric.collector import *`.
# `get_metrics` is defined in this module and belongs to the public surface too.
__all__ = [
    'collect_cpu',
    'collect_mem',
    'collect_npu',
    'get_metrics',
]


def get_metrics():
    """
    Gather one snapshot of the NPU, CPU and virtual memory metrics.

    Returns:
        dict, the collected metrics:
        - npu: The result of `collect_npu()`.
        - cpu (dict): `overall` and `percpu` CPU times, both collected in percentage mode.
        - memory (dict): `virtual`, holding the `available` and `used` entries of `collect_mem()`.
    """
    mem = collect_mem()
    npu = collect_npu()
    cpu = {
        'overall': collect_cpu(percent=True),
        'percpu': collect_cpu(percpu=True, percent=True),
    }
    virtual = {key: mem.get(key) for key in ('available', 'used')}
    return {'npu': npu, 'cpu': cpu, 'memory': {'virtual': virtual}}

+ 37
- 0
mindinsight/sysmetric/collector/_collect_cpu.py View File

@@ -0,0 +1,37 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""The cpu collector."""

import psutil


def collect_cpu(percpu=False, percent=False):
    """
    Collect the CPU times.

    Args:
        percpu (bool): If True, return one entry per CPU instead of a single overall entry.
        percent (bool): If True, use `psutil.cpu_times_percent` instead of `psutil.cpu_times`.

    Returns:
        Union[dict, List[dict]], a dict when `percpu` is False, otherwise a list of dicts.
    """
    sampler = psutil.cpu_times_percent if percent else psutil.cpu_times
    times = sampler(percpu=percpu)
    if percpu:
        # One named tuple per CPU; turn each of them into a plain dict.
        return [dict(core._asdict()) for core in times]
    return dict(times._asdict())

+ 34
- 0
mindinsight/sysmetric/collector/_collect_mem.py View File

@@ -0,0 +1,34 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""The memory collector."""

import psutil
from psutil._common import bytes2human


def collect_mem(readable=False):
    """
    Collect the virtual memory info reported by `psutil.virtual_memory()`.

    Args:
        readable (bool): If True, format every value except `percent` with `bytes2human`.

    Returns:
        dict, the virtual memory info.
    """
    stats = psutil.virtual_memory()._asdict()
    if readable:
        pretty = {}
        for name, value in stats.items():
            # `percent` is passed through unchanged; everything else goes through bytes2human.
            pretty[name] = value if name == 'percent' else bytes2human(value)
        return pretty
    return dict(stats)

+ 281
- 0
mindinsight/sysmetric/collector/_collect_npu.py View File

@@ -0,0 +1,281 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""The npu collector."""

import inspect
from ctypes import CDLL, Structure, byref, c_char, c_int, c_uint, c_ulong, c_ushort
from functools import lru_cache

from mindinsight.sysmetric.common.log import logger

# Handle of the driver library; stays None when the shared object cannot be loaded.
libsmi = None
try:
    libsmi = CDLL('libdrvdsmi_host.so')
except OSError:
    logger.info('Failed to load libdrvdsmi_host.so.')


def libsmicall(*args, **kwargs):
    """
    Call the libdrvdsmi_host function that has the same name as the calling Python function.

    The library symbol is looked up by the name of the direct caller, so this helper must be
    invoked straight from the body of a function named after the symbol (not from a nested
    function, lambda or comprehension).

    Args:
        *args: Positional arguments forwarded to the library function.
        **kwargs: Keyword arguments forwarded to the library function.

    Returns:
        The value returned by the library function.

    Raises:
        ValueError: If libdrvdsmi_host is not loaded.
    """
    if not libsmi:
        logger.error('Trying to call the libdrvdsmi_host which is not loaded.')
        raise ValueError('Trying to call the libdrvdsmi_host which is not loaded.')
    # Only the direct caller's name is needed. inspect.stack() would build a FrameInfo
    # (with source context) for every frame on the stack on each driver call.
    fname = inspect.currentframe().f_back.f_code.co_name
    return getattr(libsmi, fname)(*args, **kwargs)


@lru_cache(maxsize=4)
def dsmi_get_device_count():
    """
    Query the device count from the driver library.

    The result is memoized by `lru_cache`.

    Returns:
        int, the device count.
    """
    count = c_int()
    count_ref = byref(count)
    # Must be called directly from this function: libsmicall resolves the symbol by caller name.
    libsmicall(count_ref)
    return count.value


@lru_cache(maxsize=4)
def dsmi_list_device(count):
    """
    Query the device IDs from the driver library.

    The result is memoized per `count` by `lru_cache`.

    Args:
        count (int): The device count, used as the length of the ID buffer.

    Returns:
        List[int], the device IDs.
    """
    # Buffer of `count` C ints that is handed to the library call.
    ids = (c_int * count)()
    # Must be called directly from this function: libsmicall resolves the symbol by caller name.
    libsmicall(ids, c_int(count))
    return list(ids)


@lru_cache(maxsize=8)
def dsmi_get_chip_info(device_id):
    """
    Query the chip info of a device.

    The result is memoized per `device_id` by `lru_cache`.

    Args:
        device_id (int): The specific device id.

    Returns:
        dict, the chip info:
        - chip_type (str): The chip type.
        - chip_name (str): The chip name.
        - chip_ver (str): The chip version.
    """
    text_field = c_char * 32

    class ChipInfoStruct(Structure):
        _fields_ = [('chip_type', text_field), ('chip_name', text_field), ('chip_ver', text_field)]

    info = ChipInfoStruct()
    # Must be called directly from this function: libsmicall resolves the symbol by caller name.
    libsmicall(c_int(device_id), byref(info))
    return {name: getattr(info, name).decode('utf-8') for name in ('chip_type', 'chip_name', 'chip_ver')}


def dsmi_get_device_health(device_id):
    """
    Query the health state of a device.

    Args:
        device_id (int): The specific device id.

    Returns:
        int, 0 indicates normal, 1 minor alarm, 2 major alarm, 3 critical alarm, 0xffffffff device not found.
    """
    status = c_uint()
    # Must be called directly from this function: libsmicall resolves the symbol by caller name.
    libsmicall(c_int(device_id), byref(status))
    return status.value


@lru_cache(maxsize=8)
def dsmi_get_device_ip_address(device_id):
    """
    Get device IP address.

    The result is memoized per `device_id` by `lru_cache`.

    Args:
        device_id (int): The specific device ID.

    Returns:
        dict, the device IP address:
        - ip_address (str): the IP address, as four dot-separated decimal octets.
        - mask_address (str): the mask address, as four dot-separated decimal octets.
    """
    is_ipv6, port_type, port_id = False, 1, 0
    ipv4_len = 4

    class Ipaddrstruct(Structure):
        _fields_ = [('u_addr', c_char * (16 if is_ipv6 else ipv4_len)), ('ip_type', c_int)]

    ip_type = c_int(1 if is_ipv6 else 0)

    device_id = c_int(device_id)
    ip_address = Ipaddrstruct(b'', ip_type)
    mask_address = Ipaddrstruct(b'', ip_type)

    # Must be called directly from this function: libsmicall resolves the symbol by caller name.
    libsmicall(device_id, port_type, port_id, byref(ip_address), byref(mask_address))

    def dotted_quad(addr):
        # Reading `addr.u_addr` yields bytes cut at the first NUL, so an address such as
        # 192.168.0.5 would lose every octet after the zero. Copy the raw structure
        # buffer and slice the address bytes out of it instead.
        start = Ipaddrstruct.u_addr.offset
        octets = bytes(addr)[start:start + ipv4_len]
        return '.'.join(str(octet) for octet in octets)

    return {
        'ip_address': dotted_quad(ip_address),
        'mask_address': dotted_quad(mask_address)
    }


def dsmi_get_hbm_info(device_id):
    """
    Query the HBM info of a device.

    Args:
        device_id (int): The specific device id.

    Returns:
        dict, the HBM info:
        - memory_size (int): The total HBM memory, in KB.
        - freq (int): The HBM frequency, in MHz.
        - memory_usage (int): The used HBM memory, in KB.
        - temp (int): The HBM temperature, in °C.
        - bandwith_util_rate (int): The bandwidth utilization rate, in %.
    """

    class HbmInfoStruct(Structure):
        _fields_ = [
            ('memory_size', c_ulong),
            ('freq', c_uint),
            ('memory_usage', c_ulong),
            ('temp', c_int),
            ('bandwith_util_rate', c_uint),
        ]

    info = HbmInfoStruct()
    # Must be called directly from this function: libsmicall resolves the symbol by caller name.
    libsmicall(c_int(device_id), byref(info))
    # Every structure field is exported under its own name, in declaration order.
    return {name: getattr(info, name) for name, _ in HbmInfoStruct._fields_}


def dsmi_get_device_utilization_rate(device_id, device_type):
    """
    Query the utilization rate of a device component, in %.

    Note: Query AI Core when profiling turns on will return failure.

    Args:
        device_id (int): The specific device id.
        device_type (int): The device type, 1 for memory, 2 AI Core, 5 memory bandwidth, 6 HBM, 10 HBM bandwidth.

    Returns:
        int, the utilization rate.
    """
    rate = c_uint()
    # Must be called directly from this function: libsmicall resolves the symbol by caller name.
    libsmicall(c_int(device_id), c_int(device_type), byref(rate))
    return rate.value


def dsmi_get_device_power_info(device_id):
    """
    Query the power of a device.

    Args:
        device_id (int): The specific device id.

    Returns:
        dict, the device power info:
        - power (float): The device power, in Watt.
    """

    class PowerInfoStruct(Structure):
        _fields_ = [('power', c_ushort)]

    info = PowerInfoStruct()
    # Must be called directly from this function: libsmicall resolves the symbol by caller name.
    libsmicall(c_int(device_id), byref(info))
    # The raw reading is scaled by 0.1 and rounded to two decimals.
    watts = round(info.power * 0.1, 2)
    return {'power': watts}


def dsmi_get_device_temperature(device_id):
    """
    Query the temperature of a device.

    Args:
        device_id (int): The specific device id.

    Returns:
        int, the device temperature, in °C.
    """
    reading = c_uint()
    # Must be called directly from this function: libsmicall resolves the symbol by caller name.
    libsmicall(c_int(device_id), byref(reading))
    return reading.value


def collect_npu():
    """
    Collect the metrics of every NPU.

    Returns:
        Optional[List[dict]], one metrics dict per device, or None when the driver library is not loaded.
    """
    if not libsmi:
        return None

    kb_to_mb, memory_threshold = 1024, 4
    npus = []
    for device_id in dsmi_list_device(dsmi_get_device_count()):
        health = dsmi_get_device_health(device_id)
        hbm_info = dsmi_get_hbm_info(device_id)

        entry = {'chip_name': dsmi_get_chip_info(device_id).get('chip_name'), 'device_id': device_id}
        # A device counts as available when it is healthy and its scaled HBM usage is below the threshold.
        entry['available'] = health == 0 and hbm_info.get('memory_usage', 0) // kb_to_mb < memory_threshold
        entry['health'] = health
        entry['ip_address'] = dsmi_get_device_ip_address(device_id).get('ip_address')
        entry['aicore_rate'] = dsmi_get_device_utilization_rate(device_id, 2)
        # HBM figures are integer-divided by the kb_to_mb factor.
        entry['hbm_info'] = {
            'memory_size': hbm_info.get('memory_size') // kb_to_mb,
            'memory_usage': hbm_info.get('memory_usage') // kb_to_mb,
        }
        entry['power'] = dsmi_get_device_power_info(device_id).get('power')
        entry['temperature'] = dsmi_get_device_temperature(device_id)
        npus.append(entry)
    return npus

+ 14
- 0
mindinsight/sysmetric/common/__init__.py View File

@@ -0,0 +1,14 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================

+ 18
- 0
mindinsight/sysmetric/common/log.py View File

@@ -0,0 +1,18 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Resource logger."""
from mindinsight.utils.log import setup_logger

# Module-level logger of the sysmetric sub-module, created through mindinsight's setup_logger.
logger = setup_logger(sub_module='sysmetric', log_name='sysmetric')

+ 15
- 0
tests/ut/sysmetric/__init__.py View File

@@ -0,0 +1,15 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Test the system metrics."""

+ 42
- 0
tests/ut/sysmetric/metrics_collector.py View File

@@ -0,0 +1,42 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Test the metrics collector."""
from os import cpu_count

from mindinsight.sysmetric.collector import collect_cpu, collect_mem, collect_npu


def test_collect_cpu():
    """Check the overall percentage metrics and the per-CPU list returned by collect_cpu."""
    overall = collect_cpu(percent=True)
    assert isinstance(overall, dict)
    # Every percentage must stay within [0, 100].
    assert all(0 <= value <= 100 for value in overall.values())
    assert {'user', 'system', 'idle'} <= set(overall)

    cores = collect_cpu(percpu=True)
    assert isinstance(cores, list)
    assert len(cores) == cpu_count()


def test_collect_mem():
    """Check that collect_mem reports `total` and `available`, with `available` below `total`."""
    mem = collect_mem()
    for key in ('total', 'available'):
        assert key in mem
    assert mem['available'] < mem['total']


def test_collect_npu():
    """Check the number of NPUs reported by collect_npu when it returns a result."""
    devices = collect_npu()
    if devices is None:
        # collect_npu returns None when the driver library is not loaded; nothing to verify.
        return
    # NOTE(review): the expected count is hard-coded to 8 devices; confirm this matches the test hosts.
    assert len(devices) == 8

Loading…
Cancel
Save