添加利用命令获取gpu数目的函数

3 years ago · da849564d6
--- a/fastNLP/envs/init.py
+++ b/fastNLP/envs/init.py
@@ -6,7 +6,8 @@ __all__ = [
    'is_cur_env_distributed',
    'get_global_rank',
    'rank_zero_call',
    'all_rank_call'
    'all_rank_call',
    'get_gpu_count'
 ]
@@ -14,5 +15,5 @@ from .env import *
 from .set_env_on_import import set_env_on_import
 from .set_backend import dump_fastnlp_backend
 from .imports import *
 from .utils import _module_available
 from .utils import _module_available, get_gpu_count
 from .distributed import *
--- a/fastNLP/envs/set_backend.py
+++ b/fastNLP/envs/set_backend.py
@@ -5,13 +5,13 @@
 import os
 import json
 import sys
 import subprocess
 from collections import defaultdict
 from fastNLP.envs.env import FASTNLP_BACKEND, FASTNLP_DISTRIBUTED_CHECK, FASTNLP_GLOBAL_RANK, USER_CUDA_VISIBLE_DEVICES, FASTNLP_GLOBAL_SEED
 from fastNLP.envs.env import FASTNLP_BACKEND, FASTNLP_GLOBAL_RANK, USER_CUDA_VISIBLE_DEVICES, FASTNLP_GLOBAL_SEED
 from fastNLP.envs.imports import SUPPORT_BACKENDS
 from fastNLP.envs.utils import _module_available
 from fastNLP.envs.utils import _module_available, get_gpu_count
 def _set_backend():
    """
@@ -56,15 +56,17 @@ def _set_backend():
        if 'PADDLE_RANK_IN_NODE' in os.environ and 'FLAGS_selected_gpus' in os.environ:
            # 在分布式子进程下，根据 USER_VISIBLE_DEVICES 得到进程真正占有的设备
            selected_gpus = os.environ['FLAGS_selected_gpus'].split(',')
            if user_visible_devices is not None and user_visible_devices != "":
            if user_visible_devices is not None:
                # 用户通过 CUDA_VISIBLE_DEVICES 启动了分布式训练
                # 此时经过 set_backend，用户的设置会保存在 USER_CUDA_VISIBLE_DEVICES 中
                # 我们需要从中找到真正使用的设备编号
                user_visible_devices = user_visible_devices.split(",")
                selected_gpus = ",".join([user_visible_devices[int(i)] for i in selected_gpus])
            else:
                # 设置 USER_CUDA_VISIBLE_DEVICES 表明用户视角中所有设备可见
                os.environ[USER_CUDA_VISIBLE_DEVICES] = ""
                # 没有找到 USER_CUDA_VISIBLE_DEVICES，则将之设置为所有的设备
                os.environ[USER_CUDA_VISIBLE_DEVICES] = ",".join(map(str, list(
                    range(get_gpu_count())
                )))
            os.environ['CUDA_VISIBLE_DEVICES'] = ",".join(selected_gpus)
            os.environ['FLAGS_selected_gpus'] = ",".join([str(g) for g in range(len(selected_gpus))])
            os.environ['FLAGS_selected_accelerators'] = ",".join([str(g) for g in range(len(selected_gpus))])
@@ -77,7 +79,9 @@ def _set_backend():
        else:
            # 没有设置的话限制在单卡上，防止多进程时占用别的卡
            os.environ['CUDA_VISIBLE_DEVICES'] = '0'
            os.environ[USER_CUDA_VISIBLE_DEVICES] = ''
            os.environ[USER_CUDA_VISIBLE_DEVICES] = ",".join(map(str, list(
                range(get_gpu_count())
            )))
    elif backend == 'jittor':
        assert _module_available(backend), f"You must have {backend} available to use {backend} backend."
--- a/fastNLP/envs/utils.py
+++ b/fastNLP/envs/utils.py
@@ -3,6 +3,7 @@ from typing import Callable
 import importlib
 from pkg_resources import DistributionNotFound
 from packaging.version import Version
 import subprocess
 import pkg_resources
@@ -46,3 +47,15 @@ def _compare_version(package: str, op: Callable, version: str, use_base_version:
    if use_base_version:
        pkg_version = Version(pkg_version.base_version)
    return op(pkg_version, Version(version))
 def get_gpu_count():
    """
    利用命令行获取gpu数目的函数
    :return: gpu数目，如果没有显卡设备则为-1
    """
    try:
        lines = subprocess.check_output(['nvidia-smi', '--query-gpu=memory.used', '--format=csv'])
        # 经分割后还要除去头部和尾部的换行符
        return len(lines.split(b"\n")) - 2
    except:
        return -1