diff --git a/fastNLP/envs/__init__.py b/fastNLP/envs/__init__.py
index 524572b3..4ae30677 100644
--- a/fastNLP/envs/__init__.py
+++ b/fastNLP/envs/__init__.py
@@ -6,7 +6,8 @@ __all__ = [
     'is_cur_env_distributed',
     'get_global_rank',
     'rank_zero_call',
-    'all_rank_call'
+    'all_rank_call',
+    'get_gpu_count'
 ]
 
 
@@ -14,5 +15,5 @@ from .env import *
 from .set_env_on_import import set_env_on_import
 from .set_backend import dump_fastnlp_backend
 from .imports import *
-from .utils import _module_available
+from .utils import _module_available, get_gpu_count
 from .distributed import *
diff --git a/fastNLP/envs/set_backend.py b/fastNLP/envs/set_backend.py
index 18cc970e..a9e82c74 100644
--- a/fastNLP/envs/set_backend.py
+++ b/fastNLP/envs/set_backend.py
@@ -5,11 +5,10 @@
 import os
 import json
 import sys
 from collections import defaultdict
 
-from fastNLP.envs.env import FASTNLP_BACKEND, FASTNLP_DISTRIBUTED_CHECK, FASTNLP_GLOBAL_RANK, USER_CUDA_VISIBLE_DEVICES, FASTNLP_GLOBAL_SEED
+from fastNLP.envs.env import FASTNLP_BACKEND, FASTNLP_GLOBAL_RANK, USER_CUDA_VISIBLE_DEVICES, FASTNLP_GLOBAL_SEED
 from fastNLP.envs.imports import SUPPORT_BACKENDS
-from fastNLP.envs.utils import _module_available
-
+from fastNLP.envs.utils import _module_available, get_gpu_count
 
 def _set_backend():
@@ -56,15 +55,17 @@ def _set_backend():
         if 'PADDLE_RANK_IN_NODE' in os.environ and 'FLAGS_selected_gpus' in os.environ:
             # 在分布式子进程下,根据 USER_VISIBLE_DEVICES 得到进程真正占有的设备
             selected_gpus = os.environ['FLAGS_selected_gpus'].split(',')
-            if user_visible_devices is not None and user_visible_devices != "":
+            if user_visible_devices is not None:
                 # 用户通过 CUDA_VISIBLE_DEVICES 启动了分布式训练
                 # 此时经过 set_backend,用户的设置会保存在 USER_CUDA_VISIBLE_DEVICES 中
                 # 我们需要从中找到真正使用的设备编号
                 user_visible_devices = user_visible_devices.split(",")
                 selected_gpus = ",".join([user_visible_devices[int(i)] for i in selected_gpus])
             else:
-                # 设置 USER_CUDA_VISIBLE_DEVICES 表明用户视角中所有设备可见
-                os.environ[USER_CUDA_VISIBLE_DEVICES] = ""
+                # 没有找到 USER_CUDA_VISIBLE_DEVICES,则将之设置为所有的设备
+                os.environ[USER_CUDA_VISIBLE_DEVICES] = ",".join(map(str, list(
+                    range(get_gpu_count())
+                )))
             os.environ['CUDA_VISIBLE_DEVICES'] = ",".join(selected_gpus)
             os.environ['FLAGS_selected_gpus'] = ",".join([str(g) for g in range(len(selected_gpus))])
             os.environ['FLAGS_selected_accelerators'] = ",".join([str(g) for g in range(len(selected_gpus))])
@@ -77,7 +78,9 @@ def _set_backend():
         else:
             # 没有设置的话限制在单卡上,防止多进程时占用别的卡
             os.environ['CUDA_VISIBLE_DEVICES'] = '0'
-            os.environ[USER_CUDA_VISIBLE_DEVICES] = ''
+            os.environ[USER_CUDA_VISIBLE_DEVICES] = ",".join(map(str, list(
+                range(get_gpu_count())
+            )))
 
     elif backend == 'jittor':
         assert _module_available(backend), f"You must have {backend} available to use {backend} backend."
diff --git a/fastNLP/envs/utils.py b/fastNLP/envs/utils.py
index b06ba615..355c2448 100644
--- a/fastNLP/envs/utils.py
+++ b/fastNLP/envs/utils.py
@@ -3,6 +3,7 @@ from typing import Callable
 import importlib
 from pkg_resources import DistributionNotFound
 from packaging.version import Version
+import subprocess
 import pkg_resources
 
 
@@ -46,3 +47,16 @@ def _compare_version(package: str, op: Callable, version: str, use_base_version:
     if use_base_version:
         pkg_version = Version(pkg_version.base_version)
     return op(pkg_version, Version(version))
+
+
+def get_gpu_count():
+    """
+    利用命令行获取gpu数目的函数
+    :return: gpu数目,如果没有显卡设备则为-1
+    """
+    try:
+        lines = subprocess.check_output(['nvidia-smi', '--query-gpu=memory.used', '--format=csv'])
+        # 经分割后还要除去头部和尾部的换行符
+        return len(lines.split(b"\n")) - 2
+    except (OSError, subprocess.CalledProcessError):
+        return -1