Browse Source

在分布式训练中,为 USER_CUDA_VISIBLE_DEVICES 为 None 的情况添加提醒

tags/v1.0.0alpha
x54-729 3 years ago
parent
commit
a34a40dfae
2 changed files with 7 additions and 3 deletions
  1. +2
    -2
      fastNLP/core/drivers/paddle_driver/fleet.py
  2. +5
    -1
      fastNLP/core/drivers/paddle_driver/initialize_paddle_driver.py

+ 2
- 2
fastNLP/core/drivers/paddle_driver/fleet.py View File

@@ -191,8 +191,8 @@ class PaddleFleetDriver(PaddleDriver):
不管是什么情况,`PaddleFleetDriver` 在 `setup` 函数的最后,都会将所有进程的 pid 主动记录下来,这样当一个进程出现 exception 后,
driver 的 on_exception 函数就会被 trainer 调用,其会调用 os.kill 指令将其它进程 kill 掉;
"""
# if USER_CUDA_VISIBLE_DEVICES not in os.environ:
# raise RuntimeError("To run paddle distributed training, please set `FASTNLP_BACKEND` to 'paddle' before using FastNLP.")
if USER_CUDA_VISIBLE_DEVICES not in os.environ:
raise RuntimeError("To run paddle distributed training, please set `FASTNLP_BACKEND` to 'paddle' before using FastNLP.")
super(PaddleFleetDriver, self).__init__(model, fp16=fp16, **kwargs)

# 如果不是通过 launch 启动,要求用户必须传入 parallel_device


+ 5
- 1
fastNLP/core/drivers/paddle_driver/initialize_paddle_driver.py View File

@@ -7,6 +7,7 @@ from .single_device import PaddleSingleDriver
from .fleet import PaddleFleetDriver

from fastNLP.envs.imports import _NEED_IMPORT_PADDLE
from fastNLP.envs.env import USER_CUDA_VISIBLE_DEVICES
from fastNLP.core.utils import is_in_paddle_launch_dist, get_paddle_gpu_str
from fastNLP.core.log import logger

@@ -30,8 +31,10 @@ def initialize_paddle_driver(driver: str, device: Optional[Union[str, int, List[
"""
if driver != "paddle":
raise ValueError("When initialize PaddleDriver, parameter `driver` must be 'paddle'.")
user_visible_devices = os.getenv("USER_CUDA_VISIBLE_DEVICES")
user_visible_devices = os.getenv(USER_CUDA_VISIBLE_DEVICES)
if is_in_paddle_launch_dist():
if user_visible_devices is None:
raise RuntimeError("To run paddle distributed training, please set `FASTNLP_BACKEND` to 'paddle' before using FastNLP.")
if device is not None:
logger.warning_once("Parameter `device` would be ignored when you are using `paddle.distributed.launch` to pull "
"up your script. And we will directly get the local device via environment variables.")
@@ -65,6 +68,7 @@ def initialize_paddle_driver(driver: str, device: Optional[Union[str, int, List[
device = [get_paddle_gpu_str(g) for g in device]
elif device is not None and not isinstance(device, str):
raise ValueError("Parameter `device` is wrong type, please check our documentation for the right use.")
if isinstance(device, List):
return PaddleFleetDriver(model, device, **kwargs)
else:


Loading…
Cancel
Save