Browse Source

fix several fixes

fix npu timeout mechanism

sum not used and not available to others

add new field of success
tags/v0.6.0-beta
Li Hongzhang 5 years ago
parent
commit
02217977b4
4 changed files with 177 additions and 50 deletions
  1. +7
    -8
      mindinsight/sysmetric/collector/__init__.py
  2. +140
    -42
      mindinsight/sysmetric/collector/_collect_npu.py
  3. +25
    -0
      mindinsight/sysmetric/common/exceptions.py
  4. +5
    -0
      mindinsight/utils/constant.py

+ 7
- 8
mindinsight/sysmetric/collector/__init__.py View File

@@ -13,20 +13,18 @@
# limitations under the License. # limitations under the License.
# ============================================================================ # ============================================================================
"""The metrics collector.""" """The metrics collector."""

from ._collect_cpu import collect_cpu from ._collect_cpu import collect_cpu
from ._collect_mem import collect_mem from ._collect_mem import collect_mem
from ._collect_npu import collect_npu from ._collect_npu import collect_npu


__all__ = [
'collect_cpu',
'collect_mem',
'collect_npu',
]
__all__ = ['collect_cpu', 'collect_mem', 'collect_npu', 'get_metrics']




def get_metrics(): def get_metrics():
mem = collect_mem() mem = collect_mem()
mem_total = mem.get('total')
mem_available = mem.get('available')
mem_used = mem.get('used')
return { return {
'npu': collect_npu(), 'npu': collect_npu(),
'cpu': { 'cpu': {
@@ -35,8 +33,9 @@ def get_metrics():
}, },
'memory': { 'memory': {
'virtual': { 'virtual': {
'available': mem.get('available'),
'used': mem.get('used')
'available': mem_available,
'used': mem_used,
'others': max(mem_total - mem_available - mem_used, 0)
} }
} }
} }

+ 140
- 42
mindinsight/sysmetric/collector/_collect_npu.py View File

@@ -20,6 +20,7 @@ from ctypes import CDLL, Structure, byref, c_char, c_int, c_uint, c_ulong, c_ush
from functools import lru_cache, wraps from functools import lru_cache, wraps
from threading import Lock, Thread from threading import Lock, Thread


from mindinsight.sysmetric.common.exceptions import DsmiQueryingException
from mindinsight.sysmetric.common.log import logger from mindinsight.sysmetric.common.log import logger




@@ -59,12 +60,39 @@ def _timeout(seconds, default):
return outer return outer




def libsmicall(*args, **kwargs):
def _fallback_to_prev_result(fn):
"""Fallback to previous successful result when failing."""
prev_result = None

@wraps(fn)
def wrap(*args):
nonlocal prev_result
sucess, result = fn(*args)
if sucess:
prev_result = result
return sucess, result
if prev_result is not None:
return sucess, prev_result
raise RuntimeError(f'{fn.__name__} querying failed and no previous successful result.')

return wrap


def _libsmicall(*args):
"""
Call the lib function to querying NPU metrics.

Returns:
bool, True when success of querying, False otherwise.
"""
if not libsmi: if not libsmi:
logger.error('Trying to call the libdrvdsmi_host which is not loaded.') logger.error('Trying to call the libdrvdsmi_host which is not loaded.')
raise ValueError('Trying to call the libdrvdsmi_host which is not loaded.') raise ValueError('Trying to call the libdrvdsmi_host which is not loaded.')
fname = inspect.stack()[1].function fname = inspect.stack()[1].function
return getattr(libsmi, fname)(*args, **kwargs)
error_code = getattr(libsmi, fname)(*args)
if error_code != 0:
logger.error(f'{fname} querying failed with error code {error_code}.')
return error_code == 0




@lru_cache(maxsize=4) @lru_cache(maxsize=4)
@@ -74,12 +102,15 @@ def dsmi_get_device_count():


Returns: Returns:
int, the device count. int, the device count.

Raises:
RuntimeError, when querying dsmi returning non-zero.
""" """
device_count = c_int() device_count = c_int()


libsmicall(byref(device_count))
return device_count.value
if _libsmicall(byref(device_count)):
return device_count.value
raise RuntimeError('Querying device count failed.')




@lru_cache(maxsize=4) @lru_cache(maxsize=4)
@@ -92,17 +123,21 @@ def dsmi_list_device(count):


Returns: Returns:
List[int], the device IDs. List[int], the device IDs.

Raises:
RuntimeError, when querying dsmi returning non-zero.
""" """
device_id_array = c_int * count device_id_array = c_int * count
device_id_list = device_id_array() device_id_list = device_id_array()
count = c_int(count) count = c_int(count)


libsmicall(device_id_list, count)
return list(device_id_list)
if _libsmicall(device_id_list, count):
return list(device_id_list)
raise RuntimeError('Querying device id list failed.')




@lru_cache(maxsize=8) @lru_cache(maxsize=8)
@_fallback_to_prev_result
def dsmi_get_chip_info(device_id): def dsmi_get_chip_info(device_id):
""" """
Get chip info. Get chip info.
@@ -115,6 +150,9 @@ def dsmi_get_chip_info(device_id):
- chip_type (str): The chip type. - chip_type (str): The chip type.
- chip_name (str): The chip name. - chip_name (str): The chip name.
- chip_ver (str): The chip version. - chip_ver (str): The chip version.

Raises:
RuntimeError, when querying dsmi returning non-zero.
""" """


class ChipInfoStruct(Structure): class ChipInfoStruct(Structure):
@@ -122,14 +160,15 @@ def dsmi_get_chip_info(device_id):


device_id = c_int(device_id) device_id = c_int(device_id)
chip_info = ChipInfoStruct() chip_info = ChipInfoStruct()
libsmicall(device_id, byref(chip_info))
return {
success = _libsmicall(device_id, byref(chip_info))
return success, {
'chip_type': chip_info.chip_type.decode('utf-8'), 'chip_type': chip_info.chip_type.decode('utf-8'),
'chip_name': chip_info.chip_name.decode('utf-8'), 'chip_name': chip_info.chip_name.decode('utf-8'),
'chip_ver': chip_info.chip_ver.decode('utf-8') 'chip_ver': chip_info.chip_ver.decode('utf-8')
} }




@_fallback_to_prev_result
def dsmi_get_device_health(device_id): def dsmi_get_device_health(device_id):
""" """
Get device health. Get device health.
@@ -139,16 +178,20 @@ def dsmi_get_device_health(device_id):


Returns: Returns:
int, 0 indicates normal, 1 minor alarm, 2 major alarm, 3 critical alarm, 0xffffffff device not found. int, 0 indicates normal, 1 minor alarm, 2 major alarm, 3 critical alarm, 0xffffffff device not found.

Raises:
RuntimeError, when querying dsmi returning non-zero.
""" """
device_id = c_int(device_id) device_id = c_int(device_id)
health = c_uint() health = c_uint()


libsmicall(device_id, byref(health))
success = _libsmicall(device_id, byref(health))


return health.value
return success, health.value




@lru_cache(maxsize=8) @lru_cache(maxsize=8)
@_fallback_to_prev_result
def dsmi_get_device_ip_address(device_id): def dsmi_get_device_ip_address(device_id):
""" """
Get device IP address. Get device IP address.
@@ -159,6 +202,9 @@ def dsmi_get_device_ip_address(device_id):
dict, the device IP address: dict, the device IP address:
- ip_address (str): the IP address. - ip_address (str): the IP address.
- mask_address (str): the mask address. - mask_address (str): the mask address.

Raises:
RuntimeError, when querying dsmi returning non-zero.
""" """
is_ipv6, port_type, port_id = False, 1, 0 is_ipv6, port_type, port_id = False, 1, 0


@@ -171,7 +217,7 @@ def dsmi_get_device_ip_address(device_id):
ip_address = Ipaddrstruct(b'', ip_type) ip_address = Ipaddrstruct(b'', ip_type)
mask_address = Ipaddrstruct(b'', ip_type) mask_address = Ipaddrstruct(b'', ip_type)


libsmicall(device_id, port_type, port_id, byref(ip_address), byref(mask_address))
success = _libsmicall(device_id, port_type, port_id, byref(ip_address), byref(mask_address))


def pad(u_addr): def pad(u_addr):
for i in range(4): for i in range(4):
@@ -180,12 +226,13 @@ def dsmi_get_device_ip_address(device_id):
else: else:
yield 0 yield 0


return {
return success, {
'ip_address': '.'.join(str(c) for c in pad(ip_address.u_addr)), 'ip_address': '.'.join(str(c) for c in pad(ip_address.u_addr)),
'mask_address': '.'.join(str(c) for c in pad(mask_address.u_addr)) 'mask_address': '.'.join(str(c) for c in pad(mask_address.u_addr))
} }




@_fallback_to_prev_result
def dsmi_get_hbm_info(device_id): def dsmi_get_hbm_info(device_id):
""" """
Get the HBM info. Get the HBM info.
@@ -200,6 +247,9 @@ def dsmi_get_hbm_info(device_id):
memory_usage (int), The used HBM memory, in KB. memory_usage (int), The used HBM memory, in KB.
temp (int), The HBM temperature, in °C. temp (int), The HBM temperature, in °C.
bandwith_util_rate (int): The bandwidth utilization rate, in %. bandwith_util_rate (int): The bandwidth utilization rate, in %.

Raises:
RuntimeError, when querying dsmi returning non-zero.
""" """


class HbmInfoStruct(Structure): class HbmInfoStruct(Structure):
@@ -209,9 +259,9 @@ def dsmi_get_hbm_info(device_id):
device_id = c_int(device_id) device_id = c_int(device_id)
hbm_info = HbmInfoStruct() hbm_info = HbmInfoStruct()


libsmicall(device_id, byref(hbm_info))
success = _libsmicall(device_id, byref(hbm_info))


return {
return success, {
'memory_size': hbm_info.memory_size, 'memory_size': hbm_info.memory_size,
'freq': hbm_info.freq, 'freq': hbm_info.freq,
'memory_usage': hbm_info.memory_usage, 'memory_usage': hbm_info.memory_usage,
@@ -221,6 +271,7 @@ def dsmi_get_hbm_info(device_id):




@_timeout(0.2, 0) @_timeout(0.2, 0)
@_fallback_to_prev_result
def dsmi_get_device_utilization_rate(device_id, device_type): def dsmi_get_device_utilization_rate(device_id, device_type):
""" """
Get device utilization rate, %. Get device utilization rate, %.
@@ -236,12 +287,11 @@ def dsmi_get_device_utilization_rate(device_id, device_type):
device_id = c_int(device_id) device_id = c_int(device_id)
device_type = c_int(device_type) device_type = c_int(device_type)
utilization_rate = c_uint() utilization_rate = c_uint()

libsmicall(device_id, device_type, byref(utilization_rate))

return utilization_rate.value
success = _libsmicall(device_id, device_type, byref(utilization_rate))
return success, utilization_rate.value




@_fallback_to_prev_result
def dsmi_get_device_power_info(device_id): def dsmi_get_device_power_info(device_id):
""" """
Get the device power. Get the device power.
@@ -252,6 +302,9 @@ def dsmi_get_device_power_info(device_id):
Returns: Returns:
dict, the device power info. dict, the device power info.
- power, the device power, in Watt. - power, the device power, in Watt.

Raises:
RuntimeError, when querying dsmi returning non-zero.
""" """


class PowerInfoStruct(Structure): class PowerInfoStruct(Structure):
@@ -260,10 +313,11 @@ def dsmi_get_device_power_info(device_id):
power_info = PowerInfoStruct() power_info = PowerInfoStruct()
device_id = c_int(device_id) device_id = c_int(device_id)


libsmicall(device_id, byref(power_info))
return {'power': round(power_info.power * 0.1, 2)}
success = _libsmicall(device_id, byref(power_info))
return success, {'power': round(power_info.power * 0.1, 2)}




@_fallback_to_prev_result
def dsmi_get_device_temperature(device_id): def dsmi_get_device_temperature(device_id):
""" """
Get the device temperature. Get the device temperature.
@@ -273,13 +327,16 @@ def dsmi_get_device_temperature(device_id):


Returns: Returns:
int, the device temperature, in °C. int, the device temperature, in °C.

Raises:
RuntimeError, when querying dsmi returning non-zero.
""" """
device_id = c_int(device_id) device_id = c_int(device_id)
temperature = c_uint() temperature = c_uint()


libsmicall(device_id, byref(temperature))
success = _libsmicall(device_id, byref(temperature))


return temperature.value
return success, temperature.value




def collect_npu(): def collect_npu():
@@ -287,36 +344,77 @@ def collect_npu():


Returns: Returns:
List[dict], the metrics of each NPUs. List[dict], the metrics of each NPUs.

Raises:
DsmiQueryingException, when querying dsmi returning non-zero.
"""
try:
return _collect_npus()
except RuntimeError as e:
logger.warning(e.args[0])
raise DsmiQueryingException(e.args[0])


def _collect_npus():
"""Collect the metrics for each NPUs.

Returns:
List[dict], the metrics of each NPUs.

Raises:
RuntimeError, when querying dsmi returning non-zero.
""" """
if not libsmi: if not libsmi:
return None return None
kb_to_mb, memory_threshold = 1024, 4
count = dsmi_get_device_count() count = dsmi_get_device_count()
device_ids = dsmi_list_device(count) device_ids = dsmi_list_device(count)
npus = [] npus = []
for device_id in device_ids: for device_id in device_ids:
health = dsmi_get_device_health(device_id)
hbm_info = dsmi_get_hbm_info(device_id)
npus.append({
'chip_name': dsmi_get_chip_info(device_id).get('chip_name'),
'device_id': device_id,
'available': health == 0 and hbm_info.get('memory_usage', 0) // kb_to_mb < memory_threshold,
'health': health,
'ip_address': dsmi_get_device_ip_address(device_id).get('ip_address'),
'aicore_rate': dsmi_get_device_utilization_rate(device_id, 2),
'hbm_info': {
'memory_size': hbm_info.get('memory_size') // kb_to_mb,
'memory_usage': hbm_info.get('memory_usage') // kb_to_mb
},
'power': dsmi_get_device_power_info(device_id).get('power'),
'temperature': dsmi_get_device_temperature(device_id)
})
npu = _collect_one(device_id)
npus.append(npu)
return npus return npus




def _collect_one(device_id):
    """
    Query every metric of a single NPU and assemble them into one record.

    Args:
        device_id (int): The specific device id.

    Returns:
        dict, the NPU info, with the keys ``chip_name``, ``device_id``,
        ``available``, ``health``, ``ip_address``, ``aicore_rate``, ``hbm_info``,
        ``power``, ``temperature`` and ``success``. ``success`` is True only when
        all seven queries succeeded.

    Raises:
        RuntimeError, propagated from the dsmi querying helpers.
    """
    kb_per_mb = 1024
    idle_usage_limit_mb = 4

    # Each helper returns a (success, value) pair; the query order is kept as is.
    health_ok, health = dsmi_get_device_health(device_id)
    hbm_ok, hbm_info = dsmi_get_hbm_info(device_id)
    chip_ok, chip_info = dsmi_get_chip_info(device_id)
    ip_ok, ip_addr = dsmi_get_device_ip_address(device_id)
    # NOTE(review): this helper is also wrapped by `_timeout(0.2, 0)`; if the bare
    # default `0` is returned on timeout it cannot be unpacked here - confirm.
    rate_ok, aicore_rate = dsmi_get_device_utilization_rate(device_id, 2)
    power_ok, power_info = dsmi_get_device_power_info(device_id)
    temp_ok, temperature = dsmi_get_device_temperature(device_id)

    all_ok = all((health_ok, hbm_ok, chip_ok, ip_ok, rate_ok, power_ok, temp_ok))

    chip_name = chip_info.get('chip_name')
    # A device counts as available only when every query succeeded, it reports
    # healthy (0) and its HBM usage, converted to MB, is below the limit.
    is_available = (
        all_ok
        and health == 0
        and hbm_info.get('memory_usage', 0) // kb_per_mb < idle_usage_limit_mb
    )
    ip_address = ip_addr.get('ip_address')
    hbm_summary = {
        'memory_size': hbm_info.get('memory_size') // kb_per_mb,
        'memory_usage': hbm_info.get('memory_usage') // kb_per_mb
    }
    power = power_info.get('power')

    return {
        'chip_name': chip_name,
        'device_id': device_id,
        'available': is_available,
        'health': health,
        'ip_address': ip_address,
        'aicore_rate': aicore_rate,
        'hbm_info': hbm_summary,
        'power': power,
        'temperature': temperature,
        'success': all_ok
    }


try: try:
libsmi = CDLL('libdrvdsmi_host.so') libsmi = CDLL('libdrvdsmi_host.so')
Thread(target=collect_npu).start()
except OSError: except OSError:
logger.info('Failed to load libdrvdsmi_host.so.') logger.info('Failed to load libdrvdsmi_host.so.')
libsmi = None libsmi = None

+ 25
- 0
mindinsight/sysmetric/common/exceptions.py View File

@@ -0,0 +1,25 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Define custom exception."""

from mindinsight.utils.exceptions import MindInsightException
from mindinsight.utils.constant import SysmetricErrors


class DsmiQueryingException(MindInsightException):
    """Exception reporting a DSMI querying failure to the MindInsight error framework."""

    def __init__(self, message):
        # Always tagged with the sysmetric DSMI error code; only the message varies.
        super().__init__(SysmetricErrors.DSMI_QUERYING_NONZERO, message)

+ 5
- 0
mindinsight/utils/constant.py View File

@@ -31,6 +31,7 @@ class MindInsightModules(Enum):
DATAVISUAL = 5 DATAVISUAL = 5
PROFILERMGR = 6 PROFILERMGR = 6
SCRIPTCONVERTER = 7 SCRIPTCONVERTER = 7
SYSMETRIC = 8




class GeneralErrors(Enum): class GeneralErrors(Enum):
@@ -79,3 +80,7 @@ class DataVisualErrors(Enum):


class ScriptConverterErrors(Enum): class ScriptConverterErrors(Enum):
"""Enum definition for mindconverter errors.""" """Enum definition for mindconverter errors."""

class SysmetricErrors(Enum):
"""Enum definition for sysmetric (system metrics) errors."""
# Error code passed to the base exception by DsmiQueryingException.
DSMI_QUERYING_NONZERO = 1

Loading…
Cancel
Save