diff --git a/fastNLP/core/drivers/jittor_driver/jittor_driver.py b/fastNLP/core/drivers/jittor_driver/jittor_driver.py
index 542b39f9..ebcd7bfd 100644
--- a/fastNLP/core/drivers/jittor_driver/jittor_driver.py
+++ b/fastNLP/core/drivers/jittor_driver/jittor_driver.py
@@ -55,7 +55,7 @@ class JittorDriver(Driver):
     :param fp16: whether to enable mixed-precision training;
     :param jittor_kwargs:
     """
-    def __init__(self, model, fp16: bool = False, jittor_kwargs: Dict = {}, **kwargs):
+    def __init__(self, model, fp16: bool = False, jittor_kwargs: Dict = None, **kwargs):
         if not isinstance(model, Module):
             raise ValueError(f"Parameter `model` can not be `{type(model)}` in `JittorDriver`, it should be exactly "
                              f"`jittor.Module` type.")
@@ -67,7 +67,7 @@ class JittorDriver(Driver):
             jt.flags.auto_mixed_precision_level = 0
         self.fp16 = fp16
         self._auto_cast = nullcontext
-        self._jittor_kwargs = jittor_kwargs
+        self._jittor_kwargs = jittor_kwargs if jittor_kwargs is not None else {}

         # controls whether parameter matching in auto_param_call is disabled;
         self.wo_auto_param_call = kwargs.get("model_wo_auto_param_call", False)
diff --git a/fastNLP/core/drivers/jittor_driver/mpi.py b/fastNLP/core/drivers/jittor_driver/mpi.py
index 47e9279b..2e3d42c2 100644
--- a/fastNLP/core/drivers/jittor_driver/mpi.py
+++ b/fastNLP/core/drivers/jittor_driver/mpi.py
@@ -34,7 +34,7 @@ class JittorMPIDriver(JittorDriver):
         parallel_device: None,
         is_pull_by_jittor_run: bool = False,
         fp16: bool = False,
-        jittor_kwargs: Dict = {},
+        jittor_kwargs: Dict = None,
         **kwargs
     ):
diff --git a/fastNLP/core/drivers/jittor_driver/single_device.py b/fastNLP/core/drivers/jittor_driver/single_device.py
index be8ef1b9..eda11660 100644
--- a/fastNLP/core/drivers/jittor_driver/single_device.py
+++ b/fastNLP/core/drivers/jittor_driver/single_device.py
@@ -37,7 +37,7 @@ class JittorSingleDriver(JittorDriver):
     :param jittor_kwargs:
     """

-    def __init__(self, model, device=None, fp16: bool = False, jittor_kwargs: Dict = {}, **kwargs):
+    def __init__(self, model, device=None, fp16: bool = False, jittor_kwargs: Dict = None, **kwargs):
         if device not in [None, "cpu", "gpu", "cuda"]:
             raise RuntimeError("Parameter `device` should be one of [None, 'cpu', 'gpu', 'cuda'] .")
         super(JittorSingleDriver, self).__init__(model, fp16, jittor_kwargs=jittor_kwargs)
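Note on the pattern applied throughout this diff: a mutable default such as `Dict = {}` is evaluated once, at function definition time, so every instance that falls back on the default shares a single dict object, and a mutation made through one instance leaks into all the others. A minimal standalone sketch of the failure mode and of the `None`-sentinel fix used above (illustrative classes, not fastNLP code):

```python
from typing import Dict, Optional


class BadDriver:
    # The default dict is created once, when the function is defined,
    # and shared by every call that relies on it.
    def __init__(self, backend_kwargs: Dict = {}):
        self._backend_kwargs = backend_kwargs


class GoodDriver:
    # `None` is an immutable sentinel; each call builds a fresh dict.
    def __init__(self, backend_kwargs: Optional[Dict] = None):
        self._backend_kwargs = backend_kwargs if backend_kwargs is not None else {}


a, b = BadDriver(), BadDriver()
a._backend_kwargs["fp16"] = True
print(b._backend_kwargs)  # {'fp16': True} -- the mutation leaked into b

c, d = GoodDriver(), GoodDriver()
c._backend_kwargs["fp16"] = True
print(d._backend_kwargs)  # {} -- instances stay independent
```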
diff --git a/fastNLP/core/drivers/oneflow_driver/ddp.py b/fastNLP/core/drivers/oneflow_driver/ddp.py
index fb992bc8..4a285856 100644
--- a/fastNLP/core/drivers/oneflow_driver/ddp.py
+++ b/fastNLP/core/drivers/oneflow_driver/ddp.py
@@ -46,7 +46,7 @@ class OneflowDDPDriver(OneflowDriver):
        any information about how many machines are currently in use;

    :param model: the ``model`` argument passed to ``Trainer``;
-    :param parallel_device: this parameter has no effect; **FastNLP** automatically obtains the current process's device;
+    :param parallel_device: this parameter has no effect; **fastNLP** automatically obtains the current process's device;
    :param fp16: whether to enable fp16 training; currently this parameter has no effect;
    :param oneflow_kwargs:
        * *ddp_kwargs* -- additional arguments for ``DistributedDataParallel``; see the official **oneflow** documentation for details;
@@ -57,7 +57,7 @@ def __init__(
         self,
         model,
         parallel_device: Optional["oneflow.device"],
         fp16: bool = False,
-        oneflow_kwargs: Dict = {},
+        oneflow_kwargs: Dict = None,
         **kwargs
     ):
diff --git a/fastNLP/core/drivers/oneflow_driver/oneflow_driver.py b/fastNLP/core/drivers/oneflow_driver/oneflow_driver.py
index 17777358..29027738 100644
--- a/fastNLP/core/drivers/oneflow_driver/oneflow_driver.py
+++ b/fastNLP/core/drivers/oneflow_driver/oneflow_driver.py
@@ -48,11 +48,11 @@ class OneflowDriver(Driver):
     You can use the interfaces provided by ``OneflowDriver`` when working with ``OneflowSingleDriver`` and ``OneflowDDPDriver``;
     """

-    def __init__(self, model, fp16: Optional[bool] = False, oneflow_kwargs: Dict = {}, **kwargs):
+    def __init__(self, model, fp16: Optional[bool] = False, oneflow_kwargs: Dict = None, **kwargs):
         super(OneflowDriver, self).__init__(model)
         """ fp16 setup """
-        self._oneflow_kwargs = oneflow_kwargs
+        self._oneflow_kwargs = oneflow_kwargs if oneflow_kwargs is not None else {}
         self.fp16 = fp16

         if fp16:
diff --git a/fastNLP/core/drivers/oneflow_driver/single_device.py b/fastNLP/core/drivers/oneflow_driver/single_device.py
index aec4d0e1..84d77d14 100644
--- a/fastNLP/core/drivers/oneflow_driver/single_device.py
+++ b/fastNLP/core/drivers/oneflow_driver/single_device.py
@@ -29,14 +29,14 @@ class OneflowSingleDriver(OneflowDriver):
     :param oneflow_kwargs:
     """

-    def __init__(self, model, device: "oneflow.device", fp16: bool = False, oneflow_kwargs: Dict = {}, **kwargs):
+    def __init__(self, model, device: "oneflow.device", fp16: bool = False, oneflow_kwargs: Dict = None, **kwargs):
         cuda_visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES", None)
         if cuda_visible_devices == "":
             device = oneflow.device("cpu")
             logger.info("You have set `CUDA_VISIBLE_DEVICES` to '' in system environment variable, and we are gonna to"
                         "use `cpu` instead of `gpu` device.")

-        super(OneflowSingleDriver, self).__init__(model, fp16=fp16, **kwargs)
+        super(OneflowSingleDriver, self).__init__(model, fp16=fp16, oneflow_kwargs=oneflow_kwargs, **kwargs)

         if device is None:
             logger.debug("device is not set, fastNLP will try to automatically get it.")
diff --git a/fastNLP/core/drivers/paddle_driver/fleet.py b/fastNLP/core/drivers/paddle_driver/fleet.py
index 6668d577..137aa9db 100644
--- a/fastNLP/core/drivers/paddle_driver/fleet.py
+++ b/fastNLP/core/drivers/paddle_driver/fleet.py
@@ -152,12 +152,12 @@ class PaddleFleetDriver(PaddleDriver):
         parallel_device: Optional[Union[List[str], str]],
         is_pull_by_paddle_run: bool = False,
         fp16: bool = False,
-        paddle_kwrags: Dict = {},
+        paddle_kwargs: Dict = None,
         **kwargs
     ):
         if USER_CUDA_VISIBLE_DEVICES not in os.environ:
-            raise RuntimeError("To run paddle distributed training, please set `FASTNLP_BACKEND` to 'paddle' before using FastNLP.")
-        super(PaddleFleetDriver, self).__init__(model, fp16=fp16, paddle_kwrags=paddle_kwargs, **kwargs)
+            raise RuntimeError("To run paddle distributed training, please set `FASTNLP_BACKEND` to 'paddle' before using fastNLP.")
+        super(PaddleFleetDriver, self).__init__(model, fp16=fp16, paddle_kwargs=paddle_kwargs, **kwargs)

         # if not launched via paddle.distributed.launch, the user must pass parallel_device
         if not is_pull_by_paddle_run:
@@ -195,17 +195,14 @@ class PaddleFleetDriver(PaddleDriver):
         self.world_size = None
         self.global_rank = 0
         self.gloo_rendezvous_dir = None
-
-        # other settings for the distributed environment
-        paddle_kwargs = kwargs.get("paddle_kwargs", {})
-        self._fleet_kwargs = paddle_kwargs.get("fleet_kwargs", {})
+        self._fleet_kwargs = self._paddle_kwargs.get("fleet_kwargs", {})
         check_user_specific_params(self._fleet_kwargs, DataParallel.__init__, DataParallel.__name__)
         # distributed strategy settings for fleet.init; see the official PaddlePaddle documentation for details
         self.strategy = self._fleet_kwargs.get("strategy", fleet.DistributedStrategy())

         self.is_collective = self._fleet_kwargs.pop("is_collective", True)
         if not self.is_collective:
-            raise NotImplementedError("FastNLP only support `collective` for distributed training now.")
+            raise NotImplementedError("fastNLP only supports `collective` for distributed training now.")
         self.role_maker = self._fleet_kwargs.pop("role_maker", None)

         self.output_from_new_proc = kwargs.get("output_from_new_proc", "only_error")
diff --git a/fastNLP/core/drivers/paddle_driver/initialize_paddle_driver.py b/fastNLP/core/drivers/paddle_driver/initialize_paddle_driver.py
index 552fc622..e059e91c 100644
--- a/fastNLP/core/drivers/paddle_driver/initialize_paddle_driver.py
+++ b/fastNLP/core/drivers/paddle_driver/initialize_paddle_driver.py
@@ -38,7 +38,7 @@ def initialize_paddle_driver(driver: str, device: Optional[Union[str, int, List[
     user_visible_devices = os.getenv(USER_CUDA_VISIBLE_DEVICES)
     if is_in_paddle_launch_dist():
         if user_visible_devices is None:
-            raise RuntimeError("To run paddle distributed training, please set `FASTNLP_BACKEND` to 'paddle' before using FastNLP.")
+            raise RuntimeError("To run paddle distributed training, please set `FASTNLP_BACKEND` to 'paddle' before using fastNLP.")
         if device is not None:
             logger.rank_zero_warning("Parameter `device` would be ignored when you are using `paddle.distributed.launch` to pull "
                                      "up your script. And we will directly get the local device via environment variables.", once=True)
diff --git a/fastNLP/core/drivers/paddle_driver/paddle_driver.py b/fastNLP/core/drivers/paddle_driver/paddle_driver.py
index cccd87c8..0ba0dc1b 100644
--- a/fastNLP/core/drivers/paddle_driver/paddle_driver.py
+++ b/fastNLP/core/drivers/paddle_driver/paddle_driver.py
@@ -70,13 +70,14 @@ class PaddleDriver(Driver):
     :param paddle_kwargs:
     """

-    def __init__(self, model: "paddle.nn.Layer", fp16: Optional[bool] = False, paddle_kwrags: Dict = {}, **kwargs):
+    def __init__(self, model: "paddle.nn.Layer", fp16: Optional[bool] = False, paddle_kwargs: Dict = None, **kwargs):
         if not isinstance(model, paddle.nn.Layer):
             raise ValueError(f"Parameter `model` can not be `{type(model)}` in `PaddleDriver`, it should be exactly "
                              f"`paddle.nn.Layer` type.")

         super(PaddleDriver, self).__init__(model)
         self.fp16 = fp16
+        self._paddle_kwargs = paddle_kwargs if paddle_kwargs is not None else {}

         # GradScaler arguments
         self.auto_cast, _grad_scaler = _build_fp16_env(dummy=not fp16)
diff --git a/fastNLP/core/drivers/paddle_driver/single_device.py b/fastNLP/core/drivers/paddle_driver/single_device.py
index 267c10bd..86994b79 100644
--- a/fastNLP/core/drivers/paddle_driver/single_device.py
+++ b/fastNLP/core/drivers/paddle_driver/single_device.py
@@ -53,7 +53,7 @@ class PaddleSingleDriver(PaddleDriver):
        For a detailed description of this parameter, see :class:`~fastNLP.core.controllers.Trainer`; for the function
        ``auto_param_call``, see :func:`fastNLP.core.utils.auto_param_call`.
    """
-    def __init__(self, model: "paddle.nn.Layer", device: Union[str, int], fp16: Optional[bool] = False, paddle_kwrags: Dict = {}, **kwargs):
+    def __init__(self, model: "paddle.nn.Layer", device: Union[str, int], fp16: Optional[bool] = False, paddle_kwargs: Dict = None, **kwargs):
         if isinstance(model, DataParallel):
             raise ValueError("`paddle.DataParallel` is not supported in `PaddleSingleDriver`")

@@ -63,7 +63,7 @@ class PaddleSingleDriver(PaddleDriver):
             logger.info("You have set `CUDA_VISIBLE_DEVICES` to '' in system environment variable, and we are gonna to"
                         "use `cpu` instead of `gpu` device.")

-        super(PaddleSingleDriver, self).__init__(model, fp16=fp16, paddle_kwrags=paddle_kwrags, **kwargs)
+        super(PaddleSingleDriver, self).__init__(model, fp16=fp16, paddle_kwargs=paddle_kwargs, **kwargs)

         if device is None:
             raise ValueError("Parameter `device` can not be None in `PaddleSingleDriver`.")
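The `paddle_kwrags` → `paddle_kwargs` renames above are more than cosmetic. Because these constructors also accept `**kwargs`, a caller who passes the correctly spelled `paddle_kwargs` to a signature that declares `paddle_kwrags` has the argument silently absorbed into `kwargs` while the default is used; the removed fleet.py line even forwarded `paddle_kwrags=paddle_kwargs`, referencing a name that did not exist in its own scope. A self-contained sketch of the silent-swallow failure mode (function names are illustrative, not fastNLP's):

```python
from typing import Dict, Optional


def init_misspelled(paddle_kwrags: Optional[Dict] = None, **kwargs):
    # The caller's `paddle_kwargs` does not match the typo'd parameter name,
    # so it falls into **kwargs and the default silently wins.
    return paddle_kwrags, kwargs


def init_fixed(paddle_kwargs: Optional[Dict] = None, **kwargs):
    return paddle_kwargs, kwargs


print(init_misspelled(paddle_kwargs={"gradscaler_kwargs": {"init_scale": 2.0}}))
# (None, {'paddle_kwargs': {'gradscaler_kwargs': {'init_scale': 2.0}}})
print(init_fixed(paddle_kwargs={"gradscaler_kwargs": {"init_scale": 2.0}}))
# ({'gradscaler_kwargs': {'init_scale': 2.0}}, {})
```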
diff --git a/fastNLP/core/drivers/torch_driver/ddp.py b/fastNLP/core/drivers/torch_driver/ddp.py
index 47d9cbb5..28670071 100644
--- a/fastNLP/core/drivers/torch_driver/ddp.py
+++ b/fastNLP/core/drivers/torch_driver/ddp.py
@@ -249,7 +249,7 @@ class TorchDDPDriver(TorchDriver):
         parallel_device: Optional[Union[List["torch.device"], "torch.device"]],
         is_pull_by_torch_run: bool = False,
         fp16: bool = False,
-        torch_kwargs: Dict = {},
+        torch_kwargs: Dict = None,
         **kwargs
     ):
diff --git a/fastNLP/core/drivers/torch_driver/deepspeed.py b/fastNLP/core/drivers/torch_driver/deepspeed.py
index aedff1e9..2fc6e96e 100644
--- a/fastNLP/core/drivers/torch_driver/deepspeed.py
+++ b/fastNLP/core/drivers/torch_driver/deepspeed.py
@@ -111,7 +111,7 @@ class DeepSpeedDriver(TorchDDPDriver):
         parallel_device: Union[List["torch.device"], "torch.device"],
         is_pull_by_torch_run = False,
         fp16: bool = False,
-        deepspeed_kwargs: Dict = {},
+        deepspeed_kwargs: Dict = None,
         **kwargs
     ):
         assert _NEED_IMPORT_DEEPSPEED, "Deepspeed is not imported."
@@ -251,9 +251,9 @@ class DeepSpeedDriver(TorchDDPDriver):
         if not self.outside_ddp:
             torch.cuda.set_device(self.model_device)
-            # TODO a very large model could overflow GPU memory here, but without it memory stays occupied on the rank's device
-            # lightning broadcasts log_dir via broadcast_list beforehand, so it never hits this case
-            self.model.to(self.model_device)
+            # without dist.broadcast_object_list, the devices may be 4,5 while the model syncs to 0,1;
+            # the cause is unknown
+            dist.broadcast_object_list(["test"], 0, None)

         self.configure_ddp()
         self.barrier()
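The new `dist.broadcast_object_list(["test"], 0, None)` call carries no useful payload, so it appears to serve purely as a synchronization point: `torch.distributed.broadcast_object_list` pickles the objects on the source rank and overwrites the list in place on every other rank, blocking all ranks until the collective completes. A minimal sketch of the call itself, assuming a gloo backend and world size 1 so it runs in a single process (address and port are placeholders):

```python
import os

import torch.distributed as dist

# A world of size 1 lets the collective run standalone.
os.environ.setdefault("MASTER_ADDR", "127.0.0.1")
os.environ.setdefault("MASTER_PORT", "29500")
dist.init_process_group(backend="gloo", rank=0, world_size=1)

objects = ["test"]
# Rank 0 is the source; on other ranks the list would be overwritten in place.
dist.broadcast_object_list(objects, src=0)
print(objects)  # ['test']

dist.destroy_process_group()
```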
diff --git a/fastNLP/core/drivers/torch_driver/fairscale.py b/fastNLP/core/drivers/torch_driver/fairscale.py
index 02dda6a6..304f0bfa 100644
--- a/fastNLP/core/drivers/torch_driver/fairscale.py
+++ b/fastNLP/core/drivers/torch_driver/fairscale.py
@@ -35,11 +35,12 @@ class FairScaleDriver(TorchDDPDriver):
         parallel_device: Union[List["torch.device"], "torch.device"],
         is_pull_by_torch_run = False,
         fp16: bool = False,
+        fairscale_kwargs: Dict = None,
         **kwargs
     ):
         assert _NEED_IMPORT_FAIRSCALE, "fairscale is not imported."
         assert not dist.is_initialized(), "FairScaleDriver does not support initialize distributed by user."
-        self._fairscale_kwargs = kwargs.get('fairscale_kwargs', {})
+        self._fairscale_kwargs = fairscale_kwargs if fairscale_kwargs is not None else {}
         self.fs_type = self._fairscale_kwargs.get('fs_type', 'sdp')  # ddp, sdp, fsdp
         if self.fs_type == 'fsdp':
             self._fairscale_kwargs['set_grad_to_none'] = self._fairscale_kwargs.get('set_grad_to_none', True)
diff --git a/fastNLP/core/drivers/torch_driver/single_device.py b/fastNLP/core/drivers/torch_driver/single_device.py
index b59aba64..483dc257 100644
--- a/fastNLP/core/drivers/torch_driver/single_device.py
+++ b/fastNLP/core/drivers/torch_driver/single_device.py
@@ -41,7 +41,7 @@ class TorchSingleDriver(TorchDriver):
        * *gradscaler_kwargs* -- arguments passed to ``torch.cuda.amp.GradScaler`` when fp16=True;
    """

-    def __init__(self, model, device: "torch.device", fp16: bool = False, torch_kwargs: Dict = {}, **kwargs):
+    def __init__(self, model, device: "torch.device", fp16: bool = False, torch_kwargs: Dict = None, **kwargs):
         if isinstance(model, DistributedDataParallel):
             raise ValueError("`DistributedDataParallel` is not supported in `TorchSingleDriver`")
diff --git a/fastNLP/core/drivers/torch_driver/torch_driver.py b/fastNLP/core/drivers/torch_driver/torch_driver.py
index 60bd4147..a748aa32 100644
--- a/fastNLP/core/drivers/torch_driver/torch_driver.py
+++ b/fastNLP/core/drivers/torch_driver/torch_driver.py
@@ -51,11 +51,11 @@ class TorchDriver(Driver):
     :param fp16: whether to enable mixed-precision training;
     :param torch_kwargs:
     """
-    def __init__(self, model, fp16: Optional[bool] = False, torch_kwargs: Dict = {}, **kwargs):
+    def __init__(self, model, fp16: Optional[bool] = False, torch_kwargs: Dict = None, **kwargs):
         super(TorchDriver, self).__init__(model)

         """ fp16 setup """
-        self._torch_kwargs = torch_kwargs
+        self._torch_kwargs = torch_kwargs if torch_kwargs is not None else {}

         # the mixed-precision setup is identical for ddp and single_device, so it is abstracted here;
         self.fp16 = fp16
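Taken together, the diff converges on one shape: the base driver normalizes the `None` sentinel exactly once, and each subclass forwards its kwargs to `super().__init__` explicitly (the `OneflowSingleDriver` hunk fixes a case where the argument was accepted but never forwarded, so it was silently dropped). A condensed sketch of that shape, with hypothetical class and parameter names rather than fastNLP's:

```python
from typing import Dict, Optional


class BaseDriver:
    def __init__(self, model, fp16: bool = False,
                 backend_kwargs: Optional[Dict] = None, **kwargs):
        # Normalize the sentinel in one place; subclasses may assume a dict.
        self._backend_kwargs = backend_kwargs if backend_kwargs is not None else {}
        self.fp16 = fp16


class SingleDeviceDriver(BaseDriver):
    def __init__(self, model, device=None, fp16: bool = False,
                 backend_kwargs: Optional[Dict] = None, **kwargs):
        # Forward explicitly; omitting this silently discards the caller's kwargs.
        super().__init__(model, fp16=fp16, backend_kwargs=backend_kwargs, **kwargs)
        self.device = device


driver = SingleDeviceDriver(model=object(), backend_kwargs={"gradscaler_kwargs": {}})
print(driver._backend_kwargs)  # {'gradscaler_kwargs': {}}
```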