Merge pull request !427 from LiHongzhang/hw_boardtags/v0.6.0-beta
| @@ -111,6 +111,7 @@ def create_app(): | |||||
| static_folder_path = os.path.realpath(os.path.join(os.path.dirname(__file__), os.pardir, 'ui', 'dist', 'static')) | static_folder_path = os.path.realpath(os.path.join(os.path.dirname(__file__), os.pardir, 'ui', 'dist', 'static')) | ||||
| app = Flask(__name__, static_url_path=static_url_path, static_folder=static_folder_path) | app = Flask(__name__, static_url_path=static_url_path, static_folder=static_folder_path) | ||||
| app.config['JSON_SORT_KEYS'] = False | |||||
| if settings.ENABLE_CORS: | if settings.ENABLE_CORS: | ||||
| CORS(app, supports_credentials=True) | CORS(app, supports_credentials=True) | ||||
| @@ -17,6 +17,7 @@ | |||||
| from mindinsight.backend.datavisual.static_resource_api import init_module as static_init_module | from mindinsight.backend.datavisual.static_resource_api import init_module as static_init_module | ||||
| from mindinsight.backend.datavisual.task_manager_api import init_module as task_init_module | from mindinsight.backend.datavisual.task_manager_api import init_module as task_init_module | ||||
| from mindinsight.backend.datavisual.train_visual_api import init_module as train_init_module | from mindinsight.backend.datavisual.train_visual_api import init_module as train_init_module | ||||
| from mindinsight.backend.datavisual.sysmetric_api import init_module as sysmetric_init_module | |||||
| def init_module(app): | def init_module(app): | ||||
| @@ -30,3 +31,4 @@ def init_module(app): | |||||
| static_init_module(app) | static_init_module(app) | ||||
| task_init_module(app) | task_init_module(app) | ||||
| train_init_module(app) | train_init_module(app) | ||||
| sysmetric_init_module(app) | |||||
| @@ -0,0 +1,39 @@ | |||||
| # Copyright 2020 Huawei Technologies Co., Ltd | |||||
| # | |||||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| # you may not use this file except in compliance with the License. | |||||
| # You may obtain a copy of the License at | |||||
| # | |||||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||||
| # | |||||
| # Unless required by applicable law or agreed to in writing, software | |||||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| # See the License for the specific language governing permissions and | |||||
| # limitations under the License. | |||||
| # ============================================================================ | |||||
| """System metrics API.""" | |||||
| from flask import Blueprint, jsonify | |||||
| from mindinsight.conf import settings | |||||
| from mindinsight.sysmetric.collector import get_metrics | |||||
# Blueprint grouping the system metrics endpoints under the configured URL prefix.
BLUEPRINT = Blueprint(
    "sysmetric",
    __name__,
    url_prefix=settings.URL_PATH_PREFIX + settings.API_PREFIX,
)
@BLUEPRINT.route("/sysmetric/current", methods=["GET"])
def query_sysmetric():
    """
    Handle GET requests for the current system metrics.

    Returns:
        Response, the JSON response built by `jsonify` from the result of `get_metrics()`.
    """
    metrics = get_metrics()
    return jsonify(metrics)
def init_module(app):
    """
    Register the system metrics blueprint on the application.

    Args:
        app: The application object; `BLUEPRINT` is passed to its `register_blueprint` method.
    """
    app.register_blueprint(BLUEPRINT)
| @@ -0,0 +1,42 @@ | |||||
| # Copyright 2020 Huawei Technologies Co., Ltd | |||||
| # | |||||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| # you may not use this file except in compliance with the License. | |||||
| # You may obtain a copy of the License at | |||||
| # | |||||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||||
| # | |||||
| # Unless required by applicable law or agreed to in writing, software | |||||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| # See the License for the specific language governing permissions and | |||||
| # limitations under the License. | |||||
| # ============================================================================ | |||||
| """The metrics collector.""" | |||||
| from ._collect_cpu import collect_cpu | |||||
| from ._collect_mem import collect_mem | |||||
| from ._collect_npu import collect_npu | |||||
# Public API of the collector package. `get_metrics` is listed as well because it is the
# entry point other modules import from this package.
__all__ = [
    'collect_cpu',
    'collect_mem',
    'collect_npu',
    'get_metrics',
]
def get_metrics():
    """
    Gather one snapshot of the NPU, CPU and memory metrics.

    Returns:
        dict, the collected metrics:
            - npu: The result of `collect_npu()`.
            - cpu (dict): 'overall' and 'percpu' CPU times, both collected in percentage.
            - memory (dict): 'virtual' holding the 'available' and 'used' values of `collect_mem()`.
    """
    # The collectors are invoked in a fixed order: memory, NPU, overall CPU, per-CPU.
    memory = collect_mem()
    npu = collect_npu()
    cpu_overall = collect_cpu(percent=True)
    cpu_percpu = collect_cpu(percpu=True, percent=True)

    virtual = {key: memory.get(key) for key in ('available', 'used')}
    return {
        'npu': npu,
        'cpu': {'overall': cpu_overall, 'percpu': cpu_percpu},
        'memory': {'virtual': virtual},
    }
| @@ -0,0 +1,37 @@ | |||||
| # Copyright 2020 Huawei Technologies Co., Ltd | |||||
| # | |||||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| # you may not use this file except in compliance with the License. | |||||
| # You may obtain a copy of the License at | |||||
| # | |||||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||||
| # | |||||
| # Unless required by applicable law or agreed to in writing, software | |||||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| # See the License for the specific language governing permissions and | |||||
| # limitations under the License. | |||||
| # ============================================================================ | |||||
| """The cpu collector.""" | |||||
| import psutil | |||||
def collect_cpu(percpu=False, percent=False):
    """
    Collect the CPU times.

    Args:
        percpu (bool): Whether to return one entry per CPU instead of a single overall entry.
        percent (bool): Whether to query `psutil.cpu_times_percent` instead of `psutil.cpu_times`.

    Returns:
        Union[dict, List[dict]], a dict of CPU times, or a list of such dicts when `percpu` is True.
    """
    sampler = psutil.cpu_times_percent if percent else psutil.cpu_times
    sample = sampler(percpu=percpu)
    if percpu:
        return [dict(core._asdict()) for core in sample]
    return dict(sample._asdict())
| @@ -0,0 +1,34 @@ | |||||
| # Copyright 2020 Huawei Technologies Co., Ltd | |||||
| # | |||||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| # you may not use this file except in compliance with the License. | |||||
| # You may obtain a copy of the License at | |||||
| # | |||||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||||
| # | |||||
| # Unless required by applicable law or agreed to in writing, software | |||||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| # See the License for the specific language governing permissions and | |||||
| # limitations under the License. | |||||
| # ============================================================================ | |||||
| """The memory collector.""" | |||||
| import psutil | |||||
| from psutil._common import bytes2human | |||||
def collect_mem(readable=False):
    """
    Collect the virtual memory info.

    Args:
        readable (bool): Whether to format the sizes with `bytes2human` (e.g. 1K, 234M, 2G).

    Returns:
        dict, the fields of `psutil.virtual_memory()`; when `readable` is True every field
        except 'percent' is converted by `bytes2human`.
    """
    stats = psutil.virtual_memory()._asdict()
    if readable:
        formatted = {}
        for name, value in stats.items():
            # 'percent' is a ratio rather than a size, so it is passed through unchanged.
            formatted[name] = value if name == 'percent' else bytes2human(value)
        return formatted
    return dict(stats)
| @@ -0,0 +1,281 @@ | |||||
| # Copyright 2020 Huawei Technologies Co., Ltd | |||||
| # | |||||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| # you may not use this file except in compliance with the License. | |||||
| # You may obtain a copy of the License at | |||||
| # | |||||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||||
| # | |||||
| # Unless required by applicable law or agreed to in writing, software | |||||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| # See the License for the specific language governing permissions and | |||||
| # limitations under the License. | |||||
| # ============================================================================ | |||||
| """The npu collector.""" | |||||
| import inspect | |||||
| from ctypes import CDLL, Structure, byref, c_char, c_int, c_uint, c_ulong, c_ushort | |||||
| from functools import lru_cache | |||||
| from mindinsight.sysmetric.common.log import logger | |||||
# Handle of the DSMI driver library; it stays None when the shared object cannot be loaded,
# which the functions below use to detect that NPU metrics are unavailable.
libsmi = None
try:
    libsmi = CDLL('libdrvdsmi_host.so')
except OSError:
    logger.info('Failed to load libdrvdsmi_host.so.')
def libsmicall(*args, **kwargs):
    """
    Call the function of libdrvdsmi_host that has the same name as the calling Python function.

    The driver function is looked up by the name of the direct caller, so this helper must be
    invoked directly from the body of a function named after the driver function.

    Args:
        args: Positional arguments forwarded to the driver function.
        kwargs: Keyword arguments forwarded to the driver function.

    Returns:
        The value returned by the driver function.

    Raises:
        ValueError: If libdrvdsmi_host is not loaded.
    """
    if not libsmi:
        logger.error('Trying to call the libdrvdsmi_host which is not loaded.')
        raise ValueError('Trying to call the libdrvdsmi_host which is not loaded.')

    # Only the caller's name is needed, so read it from the caller's frame instead of
    # materializing the whole stack with `inspect.stack()` on every driver call.
    frame = inspect.currentframe()
    caller = frame.f_back if frame is not None else None
    if caller is not None:
        fname = caller.f_code.co_name
    else:
        # `inspect.currentframe()` may return None on some interpreters; fall back to the full stack walk.
        fname = inspect.stack()[1].function
    return getattr(libsmi, fname)(*args, **kwargs)
@lru_cache(maxsize=4)
def dsmi_get_device_count():
    """
    Query the number of devices.

    Returns:
        int, the device count written by the driver.
    """
    counter = c_int()
    # libsmicall resolves the driver function from this function's name, so call it directly here.
    libsmicall(byref(counter))
    return counter.value
@lru_cache(maxsize=4)
def dsmi_list_device(count):
    """
    Query the device IDs.

    Args:
        count (int): The device count, used as the length of the ID buffer.

    Returns:
        List[int], the device IDs written by the driver.
    """
    ids = (c_int * count)()
    # libsmicall resolves the driver function from this function's name, so call it directly here.
    libsmicall(ids, c_int(count))
    return list(ids)
@lru_cache(maxsize=8)
def dsmi_get_chip_info(device_id):
    """
    Query the chip info of a device.

    Args:
        device_id (int): The specific device id.

    Returns:
        dict, the chip info:
            - chip_type (str): The chip type.
            - chip_name (str): The chip name.
            - chip_ver (str): The chip version.
    """

    class ChipInfoStruct(Structure):
        _fields_ = [('chip_type', c_char * 32), ('chip_name', c_char * 32), ('chip_ver', c_char * 32)]

    info = ChipInfoStruct()
    # libsmicall resolves the driver function from this function's name, so call it directly here.
    libsmicall(c_int(device_id), byref(info))
    # Every field is a fixed-size char buffer; decode each one into a str.
    return {name: getattr(info, name).decode('utf-8') for name, _ in ChipInfoStruct._fields_}
def dsmi_get_device_health(device_id):
    """
    Query the health status of a device.

    Args:
        device_id (int): The specific device id.

    Returns:
        int, 0 indicates normal, 1 minor alarm, 2 major alarm, 3 critical alarm, 0xffffffff device not found.
    """
    status = c_uint()
    # libsmicall resolves the driver function from this function's name, so call it directly here.
    libsmicall(c_int(device_id), byref(status))
    return status.value
@lru_cache(maxsize=8)
def dsmi_get_device_ip_address(device_id):
    """
    Get device IP address.

    Only IPv4 is queried (`is_ipv6` is fixed to False), so both addresses are rendered as
    four dot-separated decimal octets.

    Args:
        device_id (int): The specific device ID.

    Returns:
        dict, the device IP address:
            - ip_address (str): the IP address.
            - mask_address (str): the mask address.
    """
    is_ipv6, port_type, port_id = False, 1, 0
    ipv4_len = 4

    # NOTE(review): the address buffer is sized by `is_ipv6`; confirm the layout against the driver header.
    class Ipaddrstruct(Structure):
        _fields_ = [('u_addr', c_char * (16 if is_ipv6 else 4)), ('ip_type', c_int)]

    ip_type = c_int(1 if is_ipv6 else 0)
    device_id = c_int(device_id)
    ip_address = Ipaddrstruct(b'', ip_type)
    mask_address = Ipaddrstruct(b'', ip_type)
    # libsmicall resolves the driver function from this function's name, so call it directly here.
    libsmicall(device_id, port_type, port_id, byref(ip_address), byref(mask_address))

    def dotted(addr):
        # Reading `addr.u_addr` yields bytes cut at the first NUL byte, which would turn an
        # address such as 192.168.0.10 into 192.168.0.0. Read the raw structure memory instead
        # so that zero octets are kept.
        start = Ipaddrstruct.u_addr.offset
        octets = bytes(addr)[start:start + ipv4_len]
        return '.'.join(str(octet) for octet in octets)

    return {
        'ip_address': dotted(ip_address),
        'mask_address': dotted(mask_address)
    }
def dsmi_get_hbm_info(device_id):
    """
    Query the HBM info of a device.

    Args:
        device_id (int): The specific device id.

    Returns:
        dict, the HBM info:
            - memory_size (int): The total HBM memory, in KB.
            - freq (int): The HBM frequency, in MHZ.
            - memory_usage (int): The used HBM memory, in KB.
            - temp (int): The HBM temperature, in °C.
            - bandwith_util_rate (int): The bandwidth utilization rate, in %.
    """

    class HbmInfoStruct(Structure):
        _fields_ = [
            ('memory_size', c_ulong),
            ('freq', c_uint),
            ('memory_usage', c_ulong),
            ('temp', c_int),
            ('bandwith_util_rate', c_uint),
        ]

    info = HbmInfoStruct()
    # libsmicall resolves the driver function from this function's name, so call it directly here.
    libsmicall(c_int(device_id), byref(info))
    # The result mirrors the structure: one entry per field, in declaration order.
    return {name: getattr(info, name) for name, _ in HbmInfoStruct._fields_}
def dsmi_get_device_utilization_rate(device_id, device_type):
    """
    Query the utilization rate of a device component, in %.

    Note: Query AI Core when profiling turns on will return failure.

    Args:
        device_id (int): The specific device id.
        device_type (int): The device type, 1 for memory, 2 AI Core, 5 memory bandwidth, 6 HBM, 10 HBM bandwidth.

    Returns:
        int, the utilization rate.
    """
    rate = c_uint()
    # libsmicall resolves the driver function from this function's name, so call it directly here.
    libsmicall(c_int(device_id), c_int(device_type), byref(rate))
    return rate.value
def dsmi_get_device_power_info(device_id):
    """
    Query the power of a device.

    Args:
        device_id (int): The specific device id.

    Returns:
        dict, the device power info:
            - power (float): The raw driver value scaled by 0.1 and rounded to two decimals, in Watt.
    """

    class PowerInfoStruct(Structure):
        _fields_ = [('power', c_ushort)]

    info = PowerInfoStruct()
    # libsmicall resolves the driver function from this function's name, so call it directly here.
    libsmicall(c_int(device_id), byref(info))
    watts = round(info.power * 0.1, 2)
    return {'power': watts}
def dsmi_get_device_temperature(device_id):
    """
    Query the temperature of a device.

    Args:
        device_id (int): The specific device id.

    Returns:
        int, the device temperature, in °C.
    """
    reading = c_uint()
    # libsmicall resolves the driver function from this function's name, so call it directly here.
    libsmicall(c_int(device_id), byref(reading))
    return reading.value
def collect_npu():
    """Collect the metrics of every NPU.

    Returns:
        Union[List[dict], None], one dict of metrics per device, or None when libdrvdsmi_host is not loaded.
    """
    if not libsmi:
        return None

    kb_to_mb, memory_threshold = 1024, 4
    metrics = []
    for device_id in dsmi_list_device(dsmi_get_device_count()):
        # The driver queries are issued in a fixed order for every device.
        health = dsmi_get_device_health(device_id)
        hbm_info = dsmi_get_hbm_info(device_id)
        chip_name = dsmi_get_chip_info(device_id).get('chip_name')
        # A device counts as available when it is healthy and its used HBM is below the threshold (in MB).
        available = not (health != 0 or hbm_info.get('memory_usage', 0) // kb_to_mb >= memory_threshold)
        ip_address = dsmi_get_device_ip_address(device_id).get('ip_address')
        aicore_rate = dsmi_get_device_utilization_rate(device_id, 2)
        hbm_summary = {
            'memory_size': hbm_info.get('memory_size') // kb_to_mb,
            'memory_usage': hbm_info.get('memory_usage') // kb_to_mb
        }
        power = dsmi_get_device_power_info(device_id).get('power')
        temperature = dsmi_get_device_temperature(device_id)
        metrics.append({
            'chip_name': chip_name,
            'device_id': device_id,
            'available': available,
            'health': health,
            'ip_address': ip_address,
            'aicore_rate': aicore_rate,
            'hbm_info': hbm_summary,
            'power': power,
            'temperature': temperature
        })
    return metrics
| @@ -0,0 +1,14 @@ | |||||
| # Copyright 2020 Huawei Technologies Co., Ltd | |||||
| # | |||||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| # you may not use this file except in compliance with the License. | |||||
| # You may obtain a copy of the License at | |||||
| # | |||||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||||
| # | |||||
| # Unless required by applicable law or agreed to in writing, software | |||||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| # See the License for the specific language governing permissions and | |||||
| # limitations under the License. | |||||
| # ============================================================================ | |||||
| @@ -0,0 +1,18 @@ | |||||
| # Copyright 2020 Huawei Technologies Co., Ltd | |||||
| # | |||||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| # you may not use this file except in compliance with the License. | |||||
| # You may obtain a copy of the License at | |||||
| # | |||||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||||
| # | |||||
| # Unless required by applicable law or agreed to in writing, software | |||||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| # See the License for the specific language governing permissions and | |||||
| # limitations under the License. | |||||
| # ============================================================================ | |||||
| """Resource logger.""" | |||||
| from mindinsight.utils.log import setup_logger | |||||
# Logger shared by the sysmetric package.
logger = setup_logger(
    sub_module='sysmetric',
    log_name='sysmetric',
)
| @@ -0,0 +1,15 @@ | |||||
| # Copyright 2020 Huawei Technologies Co., Ltd | |||||
| # | |||||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| # you may not use this file except in compliance with the License. | |||||
| # You may obtain a copy of the License at | |||||
| # | |||||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||||
| # | |||||
| # Unless required by applicable law or agreed to in writing, software | |||||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| # See the License for the specific language governing permissions and | |||||
| # limitations under the License. | |||||
| # ============================================================================ | |||||
| """Test the system metrics.""" | |||||
| @@ -0,0 +1,42 @@ | |||||
| # Copyright 2020 Huawei Technologies Co., Ltd | |||||
| # | |||||
| # Licensed under the Apache License, Version 2.0 (the "License"); | |||||
| # you may not use this file except in compliance with the License. | |||||
| # You may obtain a copy of the License at | |||||
| # | |||||
| # http://www.apache.org/licenses/LICENSE-2.0 | |||||
| # | |||||
| # Unless required by applicable law or agreed to in writing, software | |||||
| # distributed under the License is distributed on an "AS IS" BASIS, | |||||
| # WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. | |||||
| # See the License for the specific language governing permissions and | |||||
| # limitations under the License. | |||||
| # ============================================================================ | |||||
| """Test the metrics collector.""" | |||||
| from os import cpu_count | |||||
| from mindinsight.sysmetric.collector import collect_cpu, collect_mem, collect_npu | |||||
def test_collect_cpu():
    """Check the overall percentage result and the length of the per-CPU result."""
    overall = collect_cpu(percent=True)
    assert isinstance(overall, dict)
    # Every field of the percentage result must be a valid percentage.
    assert all(0 <= value <= 100 for value in overall.values())
    assert {'user', 'system', 'idle'} <= set(overall)

    cores = collect_cpu(percpu=True)
    assert isinstance(cores, list)
    assert len(cores) == cpu_count()
def test_collect_mem():
    """Check that the memory info has 'total' and 'available', with 'available' below 'total'."""
    mem = collect_mem()
    for key in ('total', 'available'):
        assert key in mem
    assert mem['available'] < mem['total']
def test_collect_npu():
    """Check the shape of the NPU metrics without assuming how many devices the host has."""
    npu = collect_npu()
    if npu is None:
        # collect_npu returns None when the driver library is not loaded; nothing to verify then.
        return
    assert isinstance(npu, list)
    for device in npu:
        assert isinstance(device, dict)
        assert 'device_id' in device