Browse Source

添加利用命令获取gpu数目的函数

tags/v1.0.0alpha
x54-729 3 years ago
parent
commit
da849564d6
3 changed files with 27 additions and 9 deletions
  1. +3
    -2
      fastNLP/envs/__init__.py
  2. +11
    -7
      fastNLP/envs/set_backend.py
  3. +13
    -0
      fastNLP/envs/utils.py

+ 3
- 2
fastNLP/envs/__init__.py View File

@@ -6,7 +6,8 @@ __all__ = [
'is_cur_env_distributed',
'get_global_rank',
'rank_zero_call',
'all_rank_call'
'all_rank_call',
'get_gpu_count'
]


@@ -14,5 +15,5 @@ from .env import *
from .set_env_on_import import set_env_on_import
from .set_backend import dump_fastnlp_backend
from .imports import *
from .utils import _module_available
from .utils import _module_available, get_gpu_count
from .distributed import *

+ 11
- 7
fastNLP/envs/set_backend.py View File

@@ -5,13 +5,13 @@
import os
import json
import sys
import subprocess
from collections import defaultdict


from fastNLP.envs.env import FASTNLP_BACKEND, FASTNLP_DISTRIBUTED_CHECK, FASTNLP_GLOBAL_RANK, USER_CUDA_VISIBLE_DEVICES, FASTNLP_GLOBAL_SEED
from fastNLP.envs.env import FASTNLP_BACKEND, FASTNLP_GLOBAL_RANK, USER_CUDA_VISIBLE_DEVICES, FASTNLP_GLOBAL_SEED
from fastNLP.envs.imports import SUPPORT_BACKENDS
from fastNLP.envs.utils import _module_available

from fastNLP.envs.utils import _module_available, get_gpu_count

def _set_backend():
"""
@@ -56,15 +56,17 @@ def _set_backend():
if 'PADDLE_RANK_IN_NODE' in os.environ and 'FLAGS_selected_gpus' in os.environ:
# 在分布式子进程下,根据 USER_VISIBLE_DEVICES 得到进程真正占有的设备
selected_gpus = os.environ['FLAGS_selected_gpus'].split(',')
if user_visible_devices is not None and user_visible_devices != "":
if user_visible_devices is not None:
# 用户通过 CUDA_VISIBLE_DEVICES 启动了分布式训练
# 此时经过 set_backend,用户的设置会保存在 USER_CUDA_VISIBLE_DEVICES 中
# 我们需要从中找到真正使用的设备编号
user_visible_devices = user_visible_devices.split(",")
selected_gpus = ",".join([user_visible_devices[int(i)] for i in selected_gpus])
else:
# 设置 USER_CUDA_VISIBLE_DEVICES 表明用户视角中所有设备可见
os.environ[USER_CUDA_VISIBLE_DEVICES] = ""
# 没有找到 USER_CUDA_VISIBLE_DEVICES,则将之设置为所有的设备
os.environ[USER_CUDA_VISIBLE_DEVICES] = ",".join(map(str, list(
range(get_gpu_count())
)))
os.environ['CUDA_VISIBLE_DEVICES'] = ",".join(selected_gpus)
os.environ['FLAGS_selected_gpus'] = ",".join([str(g) for g in range(len(selected_gpus))])
os.environ['FLAGS_selected_accelerators'] = ",".join([str(g) for g in range(len(selected_gpus))])
@@ -77,7 +79,9 @@ def _set_backend():
else:
# 没有设置的话限制在单卡上,防止多进程时占用别的卡
os.environ['CUDA_VISIBLE_DEVICES'] = '0'
os.environ[USER_CUDA_VISIBLE_DEVICES] = ''
os.environ[USER_CUDA_VISIBLE_DEVICES] = ",".join(map(str, list(
range(get_gpu_count())
)))

elif backend == 'jittor':
assert _module_available(backend), f"You must have {backend} available to use {backend} backend."


+ 13
- 0
fastNLP/envs/utils.py View File

@@ -3,6 +3,7 @@ from typing import Callable
import importlib
from pkg_resources import DistributionNotFound
from packaging.version import Version
import subprocess
import pkg_resources


@@ -46,3 +47,15 @@ def _compare_version(package: str, op: Callable, version: str, use_base_version:
if use_base_version:
pkg_version = Version(pkg_version.base_version)
return op(pkg_version, Version(version))

def get_gpu_count():
    """
    Return the number of GPUs reported by the ``nvidia-smi`` command line tool.

    :return: the number of GPUs listed by ``nvidia-smi``; ``-1`` when the
        command cannot be started or exits with a non-zero status.
    """
    try:
        # Ask only for the GPU index and drop the CSV header, so that each
        # GPU is reported as exactly one line holding an integer.
        output = subprocess.check_output(
            ['nvidia-smi', '--query-gpu=index', '--format=csv,noheader']
        )
    except (OSError, subprocess.SubprocessError):
        # OSError: the executable is missing or cannot be run.
        # SubprocessError: the command ran but reported a failure.
        return -1
    # Count only the numeric lines instead of relying on a fixed number of
    # header/trailing lines, so blank lines or stray messages are ignored.
    return sum(1 for line in output.splitlines() if line.strip().isdigit())

Loading…
Cancel
Save