diff --git a/fastNLP/core/drivers/paddle_driver/fleet.py b/fastNLP/core/drivers/paddle_driver/fleet.py index d09cacc1..59c1e0ae 100644 --- a/fastNLP/core/drivers/paddle_driver/fleet.py +++ b/fastNLP/core/drivers/paddle_driver/fleet.py @@ -191,8 +191,8 @@ class PaddleFleetDriver(PaddleDriver): 不管是什么情况,`PaddleFleetDriver` 在 `setup` 函数的最后,都会将所有进程的 pid 主动记录下来,这样当一个进程出现 exception 后, driver 的 on_exception 函数就会被 trainer 调用,其会调用 os.kill 指令将其它进程 kill 掉; """ - # if USER_CUDA_VISIBLE_DEVICES not in os.environ: - # raise RuntimeError("To run paddle distributed training, please set `FASTNLP_BACKEND` to 'paddle' before using FastNLP.") + if USER_CUDA_VISIBLE_DEVICES not in os.environ: + raise RuntimeError("To run paddle distributed training, please set `FASTNLP_BACKEND` to 'paddle' before using FastNLP.") super(PaddleFleetDriver, self).__init__(model, fp16=fp16, **kwargs) # 如果不是通过 launch 启动,要求用户必须传入 parallel_device diff --git a/fastNLP/core/drivers/paddle_driver/initialize_paddle_driver.py b/fastNLP/core/drivers/paddle_driver/initialize_paddle_driver.py index aa1b2db5..54ede2d8 100644 --- a/fastNLP/core/drivers/paddle_driver/initialize_paddle_driver.py +++ b/fastNLP/core/drivers/paddle_driver/initialize_paddle_driver.py @@ -7,6 +7,7 @@ from .single_device import PaddleSingleDriver from .fleet import PaddleFleetDriver from fastNLP.envs.imports import _NEED_IMPORT_PADDLE +from fastNLP.envs.env import USER_CUDA_VISIBLE_DEVICES from fastNLP.core.utils import is_in_paddle_launch_dist, get_paddle_gpu_str from fastNLP.core.log import logger @@ -30,8 +31,10 @@ def initialize_paddle_driver(driver: str, device: Optional[Union[str, int, List[ """ if driver != "paddle": raise ValueError("When initialize PaddleDriver, parameter `driver` must be 'paddle'.") - user_visible_devices = os.getenv("USER_CUDA_VISIBLE_DEVICES") + user_visible_devices = os.getenv(USER_CUDA_VISIBLE_DEVICES) if is_in_paddle_launch_dist(): + if user_visible_devices is None: + raise RuntimeError("To run paddle distributed training, please set `FASTNLP_BACKEND` to 'paddle' before using FastNLP.") if device is not None: logger.warning_once("Parameter `device` would be ignored when you are using `paddle.distributed.launch` to pull " "up your script. And we will directly get the local device via environment variables.") @@ -65,6 +68,7 @@ def initialize_paddle_driver(driver: str, device: Optional[Union[str, int, List[ device = [get_paddle_gpu_str(g) for g in device] elif device is not None and not isinstance(device, str): raise ValueError("Parameter `device` is wrong type, please check our documentation for the right use.") + if isinstance(device, List): return PaddleFleetDriver(model, device, **kwargs) else: