From a34a40dfae3b43462b87e9dd475c3ae5d5e200ca Mon Sep 17 00:00:00 2001
From: x54-729 <17307130121@fudan.edu.cn>
Date: Mon, 9 May 2022 19:56:15 +0000
Subject: [PATCH] =?UTF-8?q?=E5=9C=A8=E5=88=86=E5=B8=83=E5=BC=8F=E8=AE=AD?=
 =?UTF-8?q?=E7=BB=83=E4=B8=AD=EF=BC=8C=E4=B8=BAUSER=5FCUDA=5FVISIBLE=5FDEV?=
 =?UTF-8?q?=ICES=E4=B8=BANone=E7=9A=84=E6=83=85=E5=86=B5=E6=B7=BB=E5=8A=A0?=
 =?UTF-8?q?=E6=8F=90=E9=86=92?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 fastNLP/core/drivers/paddle_driver/fleet.py                 | 4 ++--
 .../core/drivers/paddle_driver/initialize_paddle_driver.py  | 6 +++++-
 2 files changed, 7 insertions(+), 3 deletions(-)

diff --git a/fastNLP/core/drivers/paddle_driver/fleet.py b/fastNLP/core/drivers/paddle_driver/fleet.py
index d09cacc1..59c1e0ae 100644
--- a/fastNLP/core/drivers/paddle_driver/fleet.py
+++ b/fastNLP/core/drivers/paddle_driver/fleet.py
@@ -191,8 +191,8 @@ class PaddleFleetDriver(PaddleDriver):
         不管是什么情况，`PaddleFleetDriver` 在 `setup` 函数的最后，都会将所有进程的 pid 主动记录下来，这样当一个进程出现 exception 后，
         driver 的 on_exception 函数就会被 trainer 调用，其会调用 os.kill 指令将其它进程 kill 掉；
         """
-        # if USER_CUDA_VISIBLE_DEVICES not in os.environ:
-        #     raise RuntimeError("To run paddle distributed training, please set `FASTNLP_BACKEND` to 'paddle' before using FastNLP.")
+        if USER_CUDA_VISIBLE_DEVICES not in os.environ:
+            raise RuntimeError("To run paddle distributed training, please set `FASTNLP_BACKEND` to 'paddle' before using FastNLP.")
         super(PaddleFleetDriver, self).__init__(model, fp16=fp16, **kwargs)
 
         # 如果不是通过 launch 启动，要求用户必须传入 parallel_device
diff --git a/fastNLP/core/drivers/paddle_driver/initialize_paddle_driver.py b/fastNLP/core/drivers/paddle_driver/initialize_paddle_driver.py
index aa1b2db5..54ede2d8 100644
--- a/fastNLP/core/drivers/paddle_driver/initialize_paddle_driver.py
+++ b/fastNLP/core/drivers/paddle_driver/initialize_paddle_driver.py
@@ -7,6 +7,7 @@ from .single_device import PaddleSingleDriver
 from .fleet import PaddleFleetDriver
 
 from fastNLP.envs.imports import _NEED_IMPORT_PADDLE
+from fastNLP.envs.env import USER_CUDA_VISIBLE_DEVICES
 from fastNLP.core.utils import is_in_paddle_launch_dist, get_paddle_gpu_str
 from fastNLP.core.log import logger
 
@@ -30,8 +31,10 @@ def initialize_paddle_driver(driver: str, device: Optional[Union[str, int, List[
     """
     if driver != "paddle":
         raise ValueError("When initialize PaddleDriver, parameter `driver` must be 'paddle'.")
-    user_visible_devices = os.getenv("USER_CUDA_VISIBLE_DEVICES")
+    user_visible_devices = os.getenv(USER_CUDA_VISIBLE_DEVICES)
     if is_in_paddle_launch_dist():
+        if user_visible_devices is None:
+            raise RuntimeError("To run paddle distributed training, please set `FASTNLP_BACKEND` to 'paddle' before using FastNLP.")
         if device is not None:
             logger.warning_once("Parameter `device` would be ignored when you are using `paddle.distributed.launch` to pull "
                                 "up your script. And we will directly get the local device via environment variables.")
@@ -65,6 +68,7 @@ def initialize_paddle_driver(driver: str, device: Optional[Union[str, int, List[
         device = [get_paddle_gpu_str(g) for g in device]
     elif device is not None and not isinstance(device, str):
         raise ValueError("Parameter `device` is wrong type, please check our documentation for the right use.")
+
     if isinstance(device, List):
         return PaddleFleetDriver(model, device, **kwargs)
     else:
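
Note: below is a minimal, self-contained sketch of the environment-variable guard this patch enables. The function name is hypothetical, and the value of the USER_CUDA_VISIBLE_DEVICES constant is assumed to be the literal string "USER_CUDA_VISIBLE_DEVICES", matching the string that the replaced os.getenv call used; only the check-and-raise pattern itself comes from the diff above.

import os

# Assumption: in fastNLP this constant lives in fastNLP.envs.env and is set
# to the literal environment-variable name shown here.
USER_CUDA_VISIBLE_DEVICES = "USER_CUDA_VISIBLE_DEVICES"

def check_user_cuda_visible_devices() -> str:
    """Fail fast when the paddle backend bootstrap has not run.

    Setting FASTNLP_BACKEND=paddle before importing FastNLP is what populates
    USER_CUDA_VISIBLE_DEVICES; without it, distributed (fleet) training cannot
    resolve the local devices, so raising early gives a clearer error.
    """
    user_visible_devices = os.getenv(USER_CUDA_VISIBLE_DEVICES)
    if user_visible_devices is None:
        raise RuntimeError(
            "To run paddle distributed training, please set `FASTNLP_BACKEND` "
            "to 'paddle' before using FastNLP."
        )
    return user_visible_devices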