
Change each Driver's own kwargs default value to None

tags/v1.0.0alpha
x54-729 2 years ago
parent commit 417a27f6c9
15 changed files with 30 additions and 31 deletions
  1. fastNLP/core/drivers/jittor_driver/jittor_driver.py  +2 -2
  2. fastNLP/core/drivers/jittor_driver/mpi.py  +1 -1
  3. fastNLP/core/drivers/jittor_driver/single_device.py  +1 -1
  4. fastNLP/core/drivers/oneflow_driver/ddp.py  +2 -2
  5. fastNLP/core/drivers/oneflow_driver/oneflow_driver.py  +2 -2
  6. fastNLP/core/drivers/oneflow_driver/single_device.py  +2 -2
  7. fastNLP/core/drivers/paddle_driver/fleet.py  +5 -8
  8. fastNLP/core/drivers/paddle_driver/initialize_paddle_driver.py  +1 -1
  9. fastNLP/core/drivers/paddle_driver/paddle_driver.py  +2 -1
  10. fastNLP/core/drivers/paddle_driver/single_device.py  +2 -2
  11. fastNLP/core/drivers/torch_driver/ddp.py  +1 -1
  12. fastNLP/core/drivers/torch_driver/deepspeed.py  +4 -4
  13. fastNLP/core/drivers/torch_driver/fairscale.py  +2 -1
  14. fastNLP/core/drivers/torch_driver/single_device.py  +1 -1
  15. fastNLP/core/drivers/torch_driver/torch_driver.py  +2 -2
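Background for the change: a literal `{}` default is evaluated once, when the function is defined, so every call that omits the argument shares the same dict object and any mutation leaks across Driver instances. The pattern applied throughout this commit is the usual `None` default plus an in-body fallback. A minimal sketch of both, for illustration only (not fastNLP code):

    from typing import Dict, Optional

    def bad_init(torch_kwargs: Dict = {}) -> Dict:
        # the default dict is created once and shared by every call that omits the argument
        torch_kwargs.setdefault("ddp_kwargs", {})["find_unused_parameters"] = True
        return torch_kwargs

    def good_init(torch_kwargs: Optional[Dict] = None) -> Dict:
        torch_kwargs = torch_kwargs if torch_kwargs is not None else {}  # fresh dict on every call
        torch_kwargs.setdefault("ddp_kwargs", {})["find_unused_parameters"] = True
        return torch_kwargs

    assert bad_init() is bad_init()        # same shared object: mutations leak between calls
    assert good_init() is not good_init()  # independent dicts: no cross-call state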

fastNLP/core/drivers/jittor_driver/jittor_driver.py  +2 -2

@@ -55,7 +55,7 @@ class JittorDriver(Driver):
     :param fp16: whether to enable mixed precision training;
     :param jittor_kwargs:
     """
-    def __init__(self, model, fp16: bool = False, jittor_kwargs: Dict = {}, **kwargs):
+    def __init__(self, model, fp16: bool = False, jittor_kwargs: Dict = None, **kwargs):
         if not isinstance(model, Module):
             raise ValueError(f"Parameter `model` can not be `{type(model)}` in `JittorDriver`, it should be exactly "
                              f"`jittor.Module` type.")
@@ -67,7 +67,7 @@ class JittorDriver(Driver):
         jt.flags.auto_mixed_precision_level = 0
         self.fp16 = fp16
         self._auto_cast = nullcontext
-        self._jittor_kwargs = jittor_kwargs
+        self._jittor_kwargs = jittor_kwargs if jittor_kwargs is not None else {}

         # used to set whether to turn off parameter matching in auto_param_call;
         self.wo_auto_param_call = kwargs.get("model_wo_auto_param_call", False)


fastNLP/core/drivers/jittor_driver/mpi.py  +1 -1

@@ -34,7 +34,7 @@ class JittorMPIDriver(JittorDriver):
         parallel_device: None,
         is_pull_by_jittor_run: bool = False,
         fp16: bool = False,
-        jittor_kwargs: Dict = {},
+        jittor_kwargs: Dict = None,
         **kwargs
     ):




fastNLP/core/drivers/jittor_driver/single_device.py  +1 -1

@@ -37,7 +37,7 @@ class JittorSingleDriver(JittorDriver):
     :param jittor_kwargs:
     """

-    def __init__(self, model, device=None, fp16: bool = False, jittor_kwargs: Dict = {}, **kwargs):
+    def __init__(self, model, device=None, fp16: bool = False, jittor_kwargs: Dict = None, **kwargs):
         if device not in [None, "cpu", "gpu", "cuda"]:
             raise RuntimeError("Parameter `device` should be one of [None, 'cpu', 'gpu', 'cuda'] .")
         super(JittorSingleDriver, self).__init__(model, fp16, jittor_kwargs=jittor_kwargs)


fastNLP/core/drivers/oneflow_driver/ddp.py  +2 -2

@@ -46,7 +46,7 @@ class OneflowDDPDriver(OneflowDriver):
        any information about how many machines there currently are;

    :param model: the ``model`` argument passed to ``Trainer``;
-    :param parallel_device: this parameter has no effect; **FastNLP** will automatically obtain the current process's device;
+    :param parallel_device: this parameter has no effect; **fastNLP** will automatically obtain the current process's device;
    :param fp16: whether to enable fp16 training; currently this parameter has no effect;
    :param oneflow_kwargs:
        * *ddp_kwargs* -- other arguments for ``DistributedDataParallel``; see the official **oneflow** documentation for details;
@@ -57,7 +57,7 @@ class OneflowDDPDriver(OneflowDriver):
        model,
        parallel_device: Optional["oneflow.device"],
        fp16: bool = False,
-        oneflow_kwargs: Dict = {},
+        oneflow_kwargs: Dict = None,
        **kwargs
    ):




fastNLP/core/drivers/oneflow_driver/oneflow_driver.py  +2 -2

@@ -48,11 +48,11 @@ class OneflowDriver(Driver):
    You can use the interfaces provided by ``OneflowDriver`` when working with ``OneflowSingleDriver`` and ``OneflowDDPDriver``;

    """
-    def __init__(self, model, fp16: Optional[bool] = False, oneflow_kwargs: Dict = {}, **kwargs):
+    def __init__(self, model, fp16: Optional[bool] = False, oneflow_kwargs: Dict = None, **kwargs):
        super(OneflowDriver, self).__init__(model)

        """ set up fp16 """
-        self._oneflow_kwargs = oneflow_kwargs
+        self._oneflow_kwargs = oneflow_kwargs if oneflow_kwargs is not None else {}

        self.fp16 = fp16
        if fp16:


fastNLP/core/drivers/oneflow_driver/single_device.py  +2 -2

@@ -29,14 +29,14 @@ class OneflowSingleDriver(OneflowDriver):
    :param oneflow_kwargs:
    """

-    def __init__(self, model, device: "oneflow.device", fp16: bool = False, oneflow_kwargs: Dict = {}, **kwargs):
+    def __init__(self, model, device: "oneflow.device", fp16: bool = False, oneflow_kwargs: Dict = None, **kwargs):
        cuda_visible_devices = os.environ.get("CUDA_VISIBLE_DEVICES", None)
        if cuda_visible_devices == "":
            device = oneflow.device("cpu")
            logger.info("You have set `CUDA_VISIBLE_DEVICES` to '' in system environment variable, and we are gonna to"
                        "use `cpu` instead of `gpu` device.")

-        super(OneflowSingleDriver, self).__init__(model, fp16=fp16, **kwargs)
+        super(OneflowSingleDriver, self).__init__(model, fp16=fp16, oneflow_kwargs=oneflow_kwargs, **kwargs)

        if device is None:
            logger.debug("device is not set, fastNLP will try to automatically get it.")
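Besides the default change, this hunk fixes the super() call: because `oneflow_kwargs` is a named parameter of `OneflowSingleDriver.__init__`, it is no longer inside `**kwargs` and has to be forwarded explicitly, otherwise the base class only ever sees its default. A rough sketch of the failure mode, with illustrative class names rather than the real fastNLP classes:

    from typing import Dict, Optional

    class Base:
        def __init__(self, model, fp16: bool = False, oneflow_kwargs: Optional[Dict] = None, **kwargs):
            self._oneflow_kwargs = oneflow_kwargs if oneflow_kwargs is not None else {}

    class Single(Base):
        def __init__(self, model, device=None, fp16: bool = False, oneflow_kwargs: Optional[Dict] = None, **kwargs):
            # `oneflow_kwargs` is captured by this signature, so it is NOT part of **kwargs;
            # dropping `oneflow_kwargs=oneflow_kwargs` here would silently discard the user's settings
            super().__init__(model, fp16=fp16, oneflow_kwargs=oneflow_kwargs, **kwargs)

    drv = Single(model=object(), oneflow_kwargs={"ddp_kwargs": {"broadcast_buffers": False}})
    assert drv._oneflow_kwargs == {"ddp_kwargs": {"broadcast_buffers": False}}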


fastNLP/core/drivers/paddle_driver/fleet.py  +5 -8

@@ -152,12 +152,12 @@ class PaddleFleetDriver(PaddleDriver):
        parallel_device: Optional[Union[List[str], str]],
        is_pull_by_paddle_run: bool = False,
        fp16: bool = False,
-        paddle_kwrags: Dict = {},
+        paddle_kwargs: Dict = None,
        **kwargs
    ):
        if USER_CUDA_VISIBLE_DEVICES not in os.environ:
-            raise RuntimeError("To run paddle distributed training, please set `FASTNLP_BACKEND` to 'paddle' before using FastNLP.")
-        super(PaddleFleetDriver, self).__init__(model, fp16=fp16, paddle_kwrags=paddle_kwargs, **kwargs)
+            raise RuntimeError("To run paddle distributed training, please set `FASTNLP_BACKEND` to 'paddle' before using fastNLP.")
+        super(PaddleFleetDriver, self).__init__(model, fp16=fp16, paddle_kwargs=paddle_kwargs, **kwargs)

        # if the script was not pulled up via launch, the user is required to pass in parallel_device
        if not is_pull_by_paddle_run:
@@ -195,17 +195,14 @@ class PaddleFleetDriver(PaddleDriver):
        self.world_size = None
        self.global_rank = 0
        self.gloo_rendezvous_dir = None
-
-        # other parameter settings for the distributed environment
-        paddle_kwargs = kwargs.get("paddle_kwargs", {})
-        self._fleet_kwargs = paddle_kwargs.get("fleet_kwargs", {})
+        self._fleet_kwargs = self._paddle_kwargs.get("fleet_kwargs", {})
        check_user_specific_params(self._fleet_kwargs, DataParallel.__init__, DataParallel.__name__)
        # distributed strategy settings for fleet.init; see the official PaddlePaddle documentation for details
        self.strategy = self._fleet_kwargs.get("strategy", fleet.DistributedStrategy())
        self.is_collective = self._fleet_kwargs.pop("is_collective", True)
        if not self.is_collective:
-            raise NotImplementedError("FastNLP only support `collective` for distributed training now.")
+            raise NotImplementedError("fastNLP only support `collective` for distributed training now.")
        self.role_maker = self._fleet_kwargs.pop("role_maker", None)

        self.output_from_new_proc = kwargs.get("output_from_new_proc", "only_error")
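The second hunk also stops re-reading `paddle_kwargs` out of `**kwargs` and instead reads `fleet_kwargs` from `self._paddle_kwargs`, the dict that `PaddleDriver.__init__` now normalizes (see paddle_driver.py below). A rough sketch of that division of labour, with illustrative names rather than the real fastNLP classes:

    from typing import Dict, Optional

    class PaddleBase:
        def __init__(self, model, paddle_kwargs: Optional[Dict] = None, **kwargs):
            # normalize once in the base class so subclasses never have to handle None
            self._paddle_kwargs = paddle_kwargs if paddle_kwargs is not None else {}

    class FleetLike(PaddleBase):
        def __init__(self, model, paddle_kwargs: Optional[Dict] = None, **kwargs):
            super().__init__(model, paddle_kwargs=paddle_kwargs, **kwargs)
            # read the nested dict from the normalized attribute instead of kwargs.get(...)
            self._fleet_kwargs = self._paddle_kwargs.get("fleet_kwargs", {})

    drv = FleetLike(object(), paddle_kwargs={"fleet_kwargs": {"is_collective": True}})
    assert drv._fleet_kwargs == {"is_collective": True}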


fastNLP/core/drivers/paddle_driver/initialize_paddle_driver.py  +1 -1

@@ -38,7 +38,7 @@ def initialize_paddle_driver(driver: str, device: Optional[Union[str, int, List[
    user_visible_devices = os.getenv(USER_CUDA_VISIBLE_DEVICES)
    if is_in_paddle_launch_dist():
        if user_visible_devices is None:
-            raise RuntimeError("To run paddle distributed training, please set `FASTNLP_BACKEND` to 'paddle' before using FastNLP.")
+            raise RuntimeError("To run paddle distributed training, please set `FASTNLP_BACKEND` to 'paddle' before using fastNLP.")
        if device is not None:
            logger.rank_zero_warning("Parameter `device` would be ignored when you are using `paddle.distributed.launch` to pull "
                                     "up your script. And we will directly get the local device via environment variables.", once=True)


fastNLP/core/drivers/paddle_driver/paddle_driver.py  +2 -1

@@ -70,13 +70,14 @@ class PaddleDriver(Driver):
    :param paddle_kwargs:

    """
-    def __init__(self, model: "paddle.nn.Layer", fp16: Optional[bool] = False, paddle_kwrags: Dict = {}, **kwargs):
+    def __init__(self, model: "paddle.nn.Layer", fp16: Optional[bool] = False, paddle_kwargs: Dict = None, **kwargs):
        if not isinstance(model, paddle.nn.Layer):
            raise ValueError(f"Parameter `model` can not be `{type(model)}` in `PaddleDriver`, it should be exactly "
                             f"`paddle.nn.Layer` type.")

        super(PaddleDriver, self).__init__(model)
        self.fp16 = fp16
+        self._paddle_kwargs = paddle_kwargs if paddle_kwargs is not None else {}

        # arguments for the scaler
        self.auto_cast, _grad_scaler = _build_fp16_env(dummy=not fp16)


fastNLP/core/drivers/paddle_driver/single_device.py  +2 -2

@@ -53,7 +53,7 @@ class PaddleSingleDriver(PaddleDriver):
        For a detailed description of this parameter, see :class:`~fastNLP.core.controllers.Trainer`; for the function ``auto_param_call``, see :func:`fastNLP.core.utils.auto_param_call`.

    """
-    def __init__(self, model: "paddle.nn.Layer", device: Union[str, int], fp16: Optional[bool] = False, paddle_kwrags: Dict = {}, **kwargs):
+    def __init__(self, model: "paddle.nn.Layer", device: Union[str, int], fp16: Optional[bool] = False, paddle_kwargs: Dict = None, **kwargs):
        if isinstance(model, DataParallel):
            raise ValueError("`paddle.DataParallel` is not supported in `PaddleSingleDriver`")

@@ -63,7 +63,7 @@ class PaddleSingleDriver(PaddleDriver):
            logger.info("You have set `CUDA_VISIBLE_DEVICES` to '' in system environment variable, and we are gonna to"
                        "use `cpu` instead of `gpu` device.")

-        super(PaddleSingleDriver, self).__init__(model, fp16=fp16, paddle_kwrags=paddle_kwrags, **kwargs)
+        super(PaddleSingleDriver, self).__init__(model, fp16=fp16, paddle_kwargs=paddle_kwargs, **kwargs)

        if device is None:
            raise ValueError("Parameter `device` can not be None in `PaddleSingleDriver`.")


fastNLP/core/drivers/torch_driver/ddp.py  +1 -1

@@ -249,7 +249,7 @@ class TorchDDPDriver(TorchDriver):
        parallel_device: Optional[Union[List["torch.device"], "torch.device"]],
        is_pull_by_torch_run: bool = False,
        fp16: bool = False,
-        torch_kwargs: Dict = {},
+        torch_kwargs: Dict = None,
        **kwargs
    ):




fastNLP/core/drivers/torch_driver/deepspeed.py  +4 -4

@@ -111,7 +111,7 @@ class DeepSpeedDriver(TorchDDPDriver):
        parallel_device: Union[List["torch.device"], "torch.device"],
        is_pull_by_torch_run = False,
        fp16: bool = False,
-        deepspeed_kwargs: Dict = {},
+        deepspeed_kwargs: Dict = None,
        **kwargs
    ):
        assert _NEED_IMPORT_DEEPSPEED, "Deepspeed is not imported."
@@ -251,9 +251,9 @@ class DeepSpeedDriver(TorchDDPDriver):

        if not self.outside_ddp:
            torch.cuda.set_device(self.model_device)
-            # TODO an oversized model should overflow GPU memory here, but without this line the memory ends up on the device matching the rank
-            # lightning broadcasts log_dir via broadcast_list beforehand, so it never runs into this situation
-            self.model.to(self.model_device)
+            # without dist.broadcast_object_list, the devices can be 4,5 while the model gets synced to 0,1
+            # reason unknown
+            dist.broadcast_object_list(["test"], 0, None)
            self.configure_ddp()

        self.barrier()
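For context, the commit swaps the explicit `self.model.to(...)` for a throwaway collective. One plausible reading is that the first collective each rank runs after `torch.cuda.set_device(...)` ties its communicator to that device, keeping later synchronization (e.g. during `configure_ddp()`) on the right GPUs, though the original comment notes the root cause was not pinned down. A minimal sketch of the idea, assuming an initialized process group and one GPU per rank (not the fastNLP implementation):

    import os
    import torch
    import torch.distributed as dist

    def bind_rank_to_device() -> torch.device:
        local_rank = int(os.environ["LOCAL_RANK"])
        device = torch.device("cuda", local_rank)
        torch.cuda.set_device(device)
        # throwaway collective: every rank joins it on the device chosen above,
        # so later collectives (e.g. weight broadcasts during DDP setup) stay on that device
        dist.broadcast_object_list(["sync"], src=0)
        return device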


fastNLP/core/drivers/torch_driver/fairscale.py  +2 -1

@@ -35,11 +35,12 @@ class FairScaleDriver(TorchDDPDriver):
        parallel_device: Union[List["torch.device"], "torch.device"],
        is_pull_by_torch_run = False,
        fp16: bool = False,
+        fairscale_kwargs: Dict = None,
        **kwargs
    ):
        assert _NEED_IMPORT_FAIRSCALE, "fairscale is not imported."
        assert not dist.is_initialized(), "FairScaleDriver does not support initialize distributed by user."
-        self._fairscale_kwargs = kwargs.get('fairscale_kwargs', {})
+        self._fairscale_kwargs = fairscale_kwargs
        self.fs_type = self._fairscale_kwargs.get('fs_type', 'sdp')  # ddp, sdp, fsdp
        if self.fs_type == 'fsdp':
            self._fairscale_kwargs['set_grad_to_none'] = self._fairscale_kwargs.get('set_grad_to_none', True)


fastNLP/core/drivers/torch_driver/single_device.py  +1 -1

@@ -41,7 +41,7 @@ class TorchSingleDriver(TorchDriver):
        * *gradscaler_kwargs* -- arguments passed to ``torch.amp.cuda.GradScaler`` when fp16=True;
    """

-    def __init__(self, model, device: "torch.device", fp16: bool = False, torch_kwargs: Dict = {}, **kwargs):
+    def __init__(self, model, device: "torch.device", fp16: bool = False, torch_kwargs: Dict = None, **kwargs):
        if isinstance(model, DistributedDataParallel):
            raise ValueError("`DistributedDataParallel` is not supported in `TorchSingleDriver`")




fastNLP/core/drivers/torch_driver/torch_driver.py  +2 -2

@@ -51,11 +51,11 @@ class TorchDriver(Driver):
    :param fp16: whether to enable mixed precision training;
    :param torch_kwargs:
    """
-    def __init__(self, model, fp16: Optional[bool] = False, torch_kwargs: Dict = {}, **kwargs):
+    def __init__(self, model, fp16: Optional[bool] = False, torch_kwargs: Dict = None, **kwargs):
        super(TorchDriver, self).__init__(model)

        """ set up fp16 """
-        self._torch_kwargs = torch_kwargs
+        self._torch_kwargs = torch_kwargs if torch_kwargs is not None else {}

        # since the mixed precision setup is the same for ddp and single_device, it can be abstracted here;
        self.fp16 = fp16

