@@ -0,0 +1,11 @@ | |||
__all__ = [ | |||
"PaddleDriver", | |||
"PaddleSingleDriver", | |||
"PaddleFleetDriver", | |||
"paddle_seed_everything", | |||
] | |||
from .paddle_driver import PaddleDriver | |||
from .single_device import PaddleSingleDriver | |||
from .fleet import PaddleFleetDriver | |||
from .utils import paddle_seed_everything |
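# A minimal usage sketch of the exports above (hedged: `MyModel` and the exact package path are
# illustrative assumptions, not guaranteed by this file):
#
#   from fastNLP.core.drivers.paddle_driver import paddle_seed_everything, PaddleSingleDriver
#   paddle_seed_everything(42, workers=True)            # fix python / numpy / paddle seeds first
#   driver = PaddleSingleDriver(MyModel(), device="gpu:0")
#   driver.setup()                                       # sets the device and moves the model onto "gpu:0"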
@@ -0,0 +1,426 @@ | |||
import os | |||
from functools import partial | |||
from typing import List, Union, Optional, Dict | |||
from .paddle_driver import PaddleDriver | |||
from .fleet_launcher import FleetLauncher | |||
from .utils import ( | |||
_FleetWrappingModel, | |||
ForwardState, | |||
_MODE_PARAMETER, | |||
get_host_name_ip, | |||
get_device_from_visible, | |||
reset_seed, | |||
) | |||
from fastNLP.envs.imports import _NEED_IMPORT_PADDLE | |||
from fastNLP.core.utils import ( | |||
auto_param_call, | |||
check_user_specific_params, | |||
paddle_move_data_to_device, | |||
is_in_paddle_dist, | |||
) | |||
from fastNLP.core.samplers import ReproducibleIterator, RandomSampler, UnrepeatedDistributedSampler | |||
from fastNLP.envs.env import FASTNLP_DISTRIBUTED_CHECK, USER_CUDA_VISIBLE_DEVICES | |||
from fastNLP.core.log import logger | |||
if _NEED_IMPORT_PADDLE: | |||
import paddle | |||
from paddle import DataParallel | |||
import paddle.distributed.fleet as fleet | |||
import paddle.distributed as dist | |||
from paddle.io import BatchSampler | |||
from paddle.optimizer import Optimizer | |||
from paddle.fluid.reader import _DatasetKind | |||
from paddle.fluid.dygraph import parallel_helper | |||
__all__ = [ | |||
"PaddleFleetDriver", | |||
] | |||
class PaddleFleetDriver(PaddleDriver): | |||
def __init__( | |||
self, | |||
model, | |||
parallel_device: Optional[Union[List[int], int]], | |||
is_pull_by_paddle_run: bool = False, | |||
fp16: bool = False, | |||
**kwargs | |||
): | |||
""" | |||
Driver for parallel paddle training through the fleet API.
PaddleFleetDriver currently aims to support three launch modes:
1. The user performs no fleet operation and simply uses our Trainer with a single main script; in this case we
spawn the extra processes ourselves (open_subprocesses) and the Driver performs the initialization;
2. Same as 1, except that the user launches the script with `python -m paddle.distributed.launch`;
3. The user initializes Fleet outside fastNLP and launches with `python -m paddle.distributed.launch`;
Note that multi-node training strictly requires launching with `python -m paddle.distributed.launch` on every machine.
If the user has initialized fleet outside fastNLP, then:
parallel_device is None;
data_device is a value describing a single card;
dist.is_initialized() is True;
""" | |||
super(PaddleFleetDriver, self).__init__(model, fp16=fp16, **kwargs) | |||
# 如果不是通过 launch 启动,要求用户必须传入 parallel_device | |||
if not is_pull_by_paddle_run and parallel_device is None: | |||
raise ValueError("Parameter `parallel_device` can not be None when using `PaddleFleetDriver`. This error is caused " | |||
"when your value of parameter `device` is `None` in your `Trainer` instance.") | |||
# 如果用户自己初始化了 paddle 的分布式训练那么一定是通过 launch 拉起的 | |||
self.is_pull_by_paddle_run = is_pull_by_paddle_run | |||
self.parallel_device = parallel_device | |||
# 在初始化时,如果发现 is_pull_by_paddle_run ,则将 parallel_device 设置成当前进程的gpu | |||
if is_pull_by_paddle_run: | |||
self._model_device = parallel_device | |||
else: | |||
self._model_device = parallel_device[self.local_rank] | |||
# 如果用户自己在外面初始化了并行模型; | |||
self.outside_fleet = False | |||
# 检测 paddle 分布式的环境变量 | |||
if parallel_helper._is_parallel_ctx_initialized(): | |||
# 如果用户自己在外面初始化了 DDP,那么我们要求用户传入的模型一定是已经由 DistributedDataParallel 包裹后的模型; | |||
if not isinstance(model, DataParallel): | |||
raise RuntimeError( | |||
"It is not allowed to input a normal model instead of `paddle.DataParallel` when" | |||
"you initialize the paddle distribued process out of our control.") | |||
self.outside_fleet = True | |||
# 用户只有将模型上传到对应机器上后才能用 DataParallel 包裹,因此如果用户在外面初始化了 Fleet,那么在 PaddleFleetDriver 中 | |||
# 我们就直接将 model_device 置为 None; | |||
self._model_device = None | |||
def _running_fn_(batch, step_fn, signature_fn): | |||
if isinstance(batch, Dict): | |||
return auto_param_call(step_fn, batch, signature_fn=signature_fn) | |||
else: | |||
return step_fn(batch)
model = model._layers | |||
if hasattr(model, "train_step"): | |||
logger.warning( | |||
"Notice your model is a `paddle.DataParallel` model. And your " | |||
"model also implements the `train_step` method, which we can not call actually, we will" | |||
" call `forward` function instead of `train_step` and you should note that.") | |||
self._train_step = partial(_running_fn_, step_fn=self.model, signature_fn=model.forward) | |||
# self._train_signature_fn = model.forward | |||
if hasattr(model, "validate_step"): | |||
logger.warning( | |||
"Notice your model is a `paddle.DataParallel` model. And your " | |||
"model also implements the `validate_step` method, which we can not call actually, " | |||
"we will call `forward` function instead of `validate_step` and you should note that.") | |||
self._validate_step = partial(_running_fn_, step_fn=self.model, signature_fn=model.forward) | |||
# self._validate_signature_fn = model.forward | |||
if hasattr(model, "test_step"): | |||
logger.warning( | |||
"Notice your model is a `paddle.DataParallel` model. And your " | |||
"model also implements the `test_step` method, which we can not call actually, we will" | |||
" call `forward` function instead of `test_step` and you should note that.") | |||
self._test_step = partial(_running_fn_, step_fn=self.model, signature_fn=model.forward) | |||
# 当参数 `device` 为 None 时并且该参数不为 None,表示将对应的数据移到指定的机器上; | |||
self._data_device = kwargs.get("_data_device", None) | |||
if self._data_device is not None: | |||
if isinstance(self._data_device, int): | |||
if self._data_device < 0: | |||
raise ValueError("Parameter `_data_device` can not be smaller than 0.") | |||
_could_use_device_num = paddle.device.cuda.device_count() | |||
if self._data_device >= _could_use_device_num: | |||
raise ValueError("The gpu device that parameter `device` specifies is not existed.") | |||
self._data_device = f"gpu:{self._data_device}" | |||
elif not isinstance(self._data_device, str): | |||
raise ValueError("Parameter `device` is wrong type, please check our documentation for the right use.") | |||
if self.outside_fleet and paddle.device.get_device() != self._data_device: | |||
logger.warning("`Parameter data_device` is not equal to paddle.deivce.get_device(), " | |||
"please keep them equal to avoid some potential bugs.") | |||
if not self.outside_fleet and parallel_device is None: | |||
raise ValueError("Parameter `parallel_device` can not be None when using `PaddleFleetDriver`. This error is caused " | |||
"when your value of parameter `device` is `None` in your `Trainer` instance.") | |||
# 可能需要放在参数里 | |||
self.strategy = kwargs.get("strategy", fleet.DistributedStrategy()) | |||
self.is_collective = kwargs.get("is_collective", True) | |||
if not self.is_collective: | |||
raise NotImplementedError("FastNLP dose not support `parameters server` for distributed training now.") | |||
self.role_maker = kwargs.get("role_maker", None) | |||
self._master_port = None | |||
self.world_size = None | |||
self.global_rank = 0 | |||
self._configured = False # 防止重复调用 configure_ddp() 函数使用 | |||
self._has_setup = False # 防止重复调用 setup() 函数 | |||
self._fleet_kwargs = kwargs.get("paddle_fleet_kwargs", {}) | |||
check_user_specific_params(self._fleet_kwargs, DataParallel.__init__) | |||
# TODO 对这些参数的检查 | |||
if self.local_rank == 0 and not is_in_paddle_dist(): | |||
# 由于使用driver时模型一定会被初始化,因此在一开始程序一定会占用一部分显存来存放模型,然而这部分显存没有 | |||
# 发挥任何作用。 | |||
logger.warning(f"The program will use some extra space on {paddle.device.get_device()} to place your model since the model " | |||
"has already been initialized.") | |||
self.output_from_new_proc = kwargs.get("output_from_new_proc", "only_error") | |||
assert isinstance(self.output_from_new_proc, str), "Parameter `output_from_new_proc` can only be `str` type." | |||
if self.output_from_new_proc not in {"all", "ignore", "only_error"}: | |||
os.makedirs(name=self.output_from_new_proc, exist_ok=True) | |||
self.output_from_new_proc = os.path.abspath(self.output_from_new_proc) | |||
def setup(self): | |||
""" | |||
在主进程拉起其它子进程,将主进程作为rank 0 | |||
""" | |||
if self._has_setup: | |||
return | |||
self._has_setup = True | |||
# 如果用户需要使用多机模式,那么一定进入到这里; | |||
if self.is_pull_by_paddle_run: | |||
if self.outside_fleet: | |||
# 已经初始化了多机环境 | |||
self.set_from_fleet_environment() | |||
else: | |||
# 用户没有初始化多机环境 | |||
# TODO 绕一下 | |||
# dist.get_world_size() 只能在初始化之后进行调用; | |||
self.world_size = int(os.environ.get("PADDLE_TRAINERS_NUM")) | |||
self.global_rank = int(os.environ.get("PADDLE_TRAINER_ID")) | |||
reset_seed() | |||
logger.warning(f"\nworld size, global rank: {self.world_size}, {self.global_rank}\n") | |||
fleet.init(self.role_maker, self.is_collective, self.strategy) | |||
else: | |||
# 在用户只使用了一个分布式 trainer 的情况下 | |||
# 此时 parallel_helper._is_parallel_ctx_initialized() 一定为 False | |||
# parallel_device 是 list, | |||
# if self.local_rank == 0 and FASTNLP_DISTRIBUTED_CHECK not in os.environ: | |||
if not parallel_helper._is_parallel_ctx_initialized(): | |||
# 没有初始化分布式环境,且是主进程 | |||
self.init_fleet_and_set() | |||
# 用户在这个 trainer 前面又初始化了一个 trainer,并且使用的是 PaddleFleetDriver; | |||
else: | |||
# 已经设置过一次,保证参数必须是一样的 | |||
pre_gpus = os.environ[FASTNLP_DISTRIBUTED_CHECK] | |||
pre_gpus = [int(x) for x in pre_gpus.split(",")]
if sorted(pre_gpus) != sorted(self.parallel_device): | |||
raise RuntimeError("Notice you are using `PaddleFleetDriver` after one instantiated `PaddleFleetDriver`, it is not" | |||
"allowed that your second `PaddleFleetDriver` has a new setting of parameters `parallel_device`.") | |||
if not self.outside_fleet: | |||
# self.model.to(self.model_device) | |||
self.configure_fleet() | |||
# 初始化 self._pids,从而使得每一个进程都能接受到 rank0 的 send 操作; | |||
# TODO 不用.to会怎么样? | |||
self._pids = [] | |||
dist.all_gather(self._pids, paddle.to_tensor(os.getpid(), dtype="int32")) | |||
# TODO LOCAL_WORLD_SIZE | |||
local_world_size = int(os.environ.get("LOCAL_WORLD_SIZE")) if "LOCAL_WORLD_SIZE" in os.environ else None | |||
if local_world_size is None: | |||
local_world_size = paddle.to_tensor(self.local_rank, dtype="int32") | |||
dist.all_reduce(local_world_size, op=dist.ReduceOp.MAX) | |||
local_world_size = local_world_size.item() + 1 | |||
node_rank = self.global_rank // local_world_size | |||
self._pids = self._pids[node_rank*local_world_size: (node_rank+1)*local_world_size] | |||
self._pids = self.tensor_to_numeric(self._pids) | |||
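# Worked example of the slicing above (hedged, illustrative numbers): with 2 nodes of 4 cards each and no
# LOCAL_WORLD_SIZE set, every process all-reduces its local_rank with MAX, so local_world_size == 3 + 1 == 4;
# a process with global_rank == 5 then gets node_rank == 1 and keeps self._pids[4:8], i.e. exactly the pids
# of the processes running on its own node.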
def init_fleet_and_set(self): | |||
""" | |||
使用 FleetLauncher 拉起子进程 | |||
""" | |||
if self.local_rank == 0: | |||
# 是 rank0 的话,则拉起其它子进程 | |||
launcher = FleetLauncher(self.parallel_device, self.output_from_new_proc) | |||
launcher.launch() | |||
# 设置参数和初始化分布式环境 | |||
reset_seed() | |||
fleet.init(self.role_maker, self.is_collective, self.strategy) | |||
self.global_rank = int(os.getenv("PADDLE_TRAINER_ID")) | |||
self.world_size = int(os.getenv("PADDLE_TRAINERS_NUM")) | |||
# 正常情况下不会Assert出问题,但还是保险一下 | |||
assert self.global_rank is not None | |||
assert self.world_size is not None | |||
assert self.world_size == len(self.parallel_device) | |||
def set_from_fleet_environment(self): | |||
""" | |||
当用户使用了 `python -m paddle.distributed.launch xxx.py` 启动时,我们需要 | |||
根据 paddle 设置的环境变量来获得各种属性 | |||
""" | |||
self.world_size = dist.get_world_size() | |||
self.global_rank = dist.get_rank() | |||
def barrier(self): | |||
dist.barrier() | |||
def configure_fleet(self): | |||
if not self._configured and not isinstance(self.model, DataParallel): | |||
self.model = DataParallel( | |||
_FleetWrappingModel(self.model), | |||
**self._fleet_kwargs | |||
) | |||
self._train_step = partial(self.model, **{_MODE_PARAMETER: ForwardState.TRAIN}) | |||
self._validate_step = partial(self.model, **{_MODE_PARAMETER: ForwardState.VALIDATE}) | |||
self._test_step = partial(self.model, **{_MODE_PARAMETER: ForwardState.TEST}) | |||
self._configured = True | |||
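# After configure_fleet(), a call such as self._train_step(batch) is (approximately) equivalent to
# self.model(batch, _forward_state=ForwardState.TRAIN): the DataParallel forward of _FleetWrappingModel
# then dispatches to the wrapped model's train_step / validate_step / test_step (or forward as a fallback).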
@property | |||
def world_size(self) -> int: | |||
return self._world_size | |||
@world_size.setter | |||
def world_size(self, size: int) -> None: | |||
self._world_size = size | |||
@property | |||
def global_rank(self) -> int: | |||
return self._global_rank | |||
@global_rank.setter | |||
def global_rank(self, rank: int) -> None: | |||
self._global_rank = rank | |||
@property | |||
def local_rank(self) -> int: | |||
return int(os.getenv("PADDLE_RANK_IN_NODE", "0")) | |||
@property | |||
def model_device(self): | |||
# 我认为这里的两个 device 应该返回真实值,对 CUDA_VISIBLDE_DEIVCES的转换应该在相应的 to 函数完成 | |||
# 否则会造成用户的困惑 | |||
return self._model_device | |||
@property | |||
def data_device(self): | |||
if self.outside_fleet: | |||
return self._data_device | |||
return self.model_device | |||
def train_step(self, batch): | |||
return self._train_step(batch) | |||
def validate_step(self, batch): | |||
return self._validate_step(batch) | |||
def test_step(self, batch): | |||
return self._test_step(batch) | |||
def replace_sampler(self, dataloader, dist_sampler: Optional[Union[str, ReproducibleIterator]] = "dist", reproducible: bool = False): | |||
# 暂时不支持iterableDataset | |||
assert dataloader.dataset_kind != _DatasetKind.ITER, \
"FastNLP does not support `IterableDataset` yet."
if isinstance(dist_sampler, ReproducibleIterator): | |||
dataloader.batch_sampler.sampler = dist_sampler | |||
return dataloader | |||
# paddle 的 BatchSampler 和 DataLoader 没有 shuffle 成员,只能根据 sampler 判断 | |||
# 但是其子类 DistributedBatchSampler 却有 shuffle 成员 | |||
# 因此用 type() 进行严格的判断 | |||
if type(dataloader.batch_sampler) == BatchSampler: | |||
shuffle = isinstance(dataloader.batch_sampler.sampler, RandomSampler) | |||
else: | |||
shuffle = dataloader.batch_sampler.shuffle | |||
# trainer, evaluator | |||
if dist_sampler is None: | |||
if reproducible: | |||
raise RuntimeError("It is not allowed to use checkpoint retraining when you initialize fleet out of our " | |||
"control.") | |||
else: | |||
return dataloader | |||
# trainer | |||
elif dist_sampler == "dist": | |||
# 如果用户的 trainer.use_dist_sampler 为 True,那么此时其是否进行断点重训,不影响这里的行为; | |||
if isinstance(dataloader.batch_sampler.sampler, ReproducibleIterator): | |||
dataloader.batch_sampler.sampler.set_distributed( | |||
num_replicas=self.world_size, | |||
rank=self.global_rank, | |||
pad=True | |||
) | |||
return dataloader | |||
else: | |||
sampler = RandomSampler( | |||
dataset=dataloader.dataset, | |||
shuffle=shuffle, | |||
seed=int(os.environ.get("FASTNLP_SEED", 0)) | |||
) | |||
sampler.set_distributed( | |||
num_replicas=self.world_size, | |||
rank=self.global_rank, | |||
pad=True | |||
) | |||
dataloader.batch_sampler.sampler = sampler | |||
return dataloader | |||
# evaluator | |||
elif dist_sampler == "unrepeatdist": | |||
sampler = UnrepeatedDistributedSampler( | |||
dataset=dataloader.dataset, | |||
shuffle=shuffle, | |||
seed=int(os.environ.get("FASTNLP_SEED", 0)) | |||
) | |||
sampler.set_distributed( | |||
num_replicas=self.world_size, | |||
rank=self.global_rank | |||
) | |||
dataloader.batch_sampler.sampler = sampler | |||
return dataloader | |||
else: | |||
raise ValueError("Parameter `dist_sampler` can only be one of three values: ('dist', 'unrepeatdist', None).") | |||
def backward(self, loss): | |||
self.grad_scaler.scale(loss).backward() | |||
def step(self): | |||
for optimizer in self.optimizers: | |||
self.grad_scaler.step(optimizer) | |||
self.grad_scaler.update() | |||
def is_global_zero(self): | |||
return self.global_rank == 0 | |||
def get_no_sync_context(self): | |||
return self.model.no_sync | |||
def unwrap_model(self): | |||
_layers = self.model._layers | |||
if isinstance(_layers, _FleetWrappingModel): | |||
return _layers.model | |||
else: | |||
return _layers | |||
def get_local_rank(self) -> int:
return self.local_rank | |||
def is_distributed(self): | |||
return True | |||
def move_data_to_device(self, batch: 'paddle.Tensor'): | |||
device = self.data_device | |||
# 因为设置了CUDA_VISIBLE_DEVICES,在子进程中可能会引起错误 | |||
if FASTNLP_DISTRIBUTED_CHECK in os.environ: | |||
device = get_device_from_visible(device) | |||
return paddle_move_data_to_device(batch, device) | |||
@staticmethod | |||
def _check_optimizer_legality(optimizers): | |||
""" | |||
paddle has an API for setting up distributed optimizers whose return value is a fleet.meta_optimizers.HybridParallelOptimizer;
the base check is overridden here so that such distributed optimizers are accepted as well.
""" | |||
DistributedOptimizer = fleet.meta_optimizers.HybridParallelOptimizer
for each_optimizer in optimizers:
if not isinstance(each_optimizer, (Optimizer, DistributedOptimizer)):
raise ValueError(f"Each optimizer of parameter `optimizers` should be 'paddle.optimizer.Optimizer' type, " | |||
f"not {type(each_optimizer)}.") |
@@ -0,0 +1,176 @@ | |||
import os | |||
import sys | |||
import __main__ | |||
import tempfile | |||
import copy | |||
from typing import List | |||
from fastNLP.core.drivers.utils import distributed_open_proc | |||
from fastNLP.envs.env import ( | |||
FASTNLP_DISTRIBUTED_CHECK, | |||
FASTNLP_LOG_LEVEL, | |||
FASTNLP_GLOBAL_SEED, | |||
USER_CUDA_VISIBLE_DEVICES, | |||
) | |||
from .utils import ( | |||
find_free_ports, | |||
reset_seed, | |||
) | |||
# 记录各个进程信息 | |||
class SubTrainer(object): | |||
""" | |||
Unrelated to fastNLP's Trainer; only used to record some information about the different training processes within a node.
""" | |||
def __init__(self, endpoint=None, rank=None): | |||
self.devices = [] | |||
self.endpoint = endpoint | |||
self.rank = rank | |||
class FleetLauncher: | |||
""" | |||
Reimplements paddle's `launch_collective` function and wraps it into a class.
Only single-node multi-GPU launching is supported.
""" | |||
def __init__( | |||
self, | |||
devices: List[int], | |||
output_from_new_proc: str = "only_error" | |||
): | |||
self.devices = devices | |||
self.output_from_new_proc = output_from_new_proc | |||
self.setup() | |||
def setup(self): | |||
self.set_endpoints() | |||
self.sub_trainers = self.get_process_info() | |||
def launch(self) -> int: | |||
# 设置环境变量 | |||
self.global_envs = self.get_global_env() | |||
self.open_subprocess() | |||
reset_seed() | |||
def open_subprocess(self): | |||
if __main__.__spec__ is None: | |||
# Script called as `python a/b/c.py` | |||
# when user is using hydra find the absolute path | |||
path_lib = os.path.abspath | |||
# pull out the commands used to run the script and resolve the abs file path | |||
command = sys.argv | |||
try: | |||
full_path = path_lib(command[0]) | |||
except Exception: | |||
full_path = os.path.abspath(command[0]) | |||
command[0] = full_path | |||
# use the same python interpreter and actually running | |||
command = [sys.executable] + command | |||
else: # Script called as `python -m a.b.c` | |||
command = [sys.executable, "-m", __main__.__spec__._name] + sys.argv[1:] | |||
current_env = copy.copy(self.global_envs) | |||
for idx, t in enumerate(self.sub_trainers): | |||
proc_env = { | |||
# global_rank | |||
"PADDLE_TRAINER_ID": f"{t.rank}", | |||
"PADDLE_CURRENT_ENDPOINT": f"{t.endpoint}", | |||
# rank | |||
"PADDLE_RANK_IN_NODE": f"{idx}", | |||
"PADDLE_LOCAL_DEVICE_IDS": | |||
",".join([str(g) for g in t.devices]), | |||
} | |||
if len(t.devices) > 0: | |||
proc_env["FLAGS_selected_gpus"] = "%s" % ",".join( | |||
[str(g) for g in t.devices]) | |||
proc_env["FLAGS_selected_devices"] = "%s" % ",".join( | |||
[str(g) for g in t.devices]) | |||
current_env.update(proc_env) | |||
if os.environ.get(FASTNLP_GLOBAL_SEED) is None and FASTNLP_GLOBAL_SEED in current_env: | |||
del current_env[FASTNLP_GLOBAL_SEED] | |||
if idx != 0: | |||
# 子进程 | |||
if os.environ.get(FASTNLP_LOG_LEVEL, None) is None: | |||
current_env[FASTNLP_LOG_LEVEL] = "warning" | |||
proc = distributed_open_proc(self.output_from_new_proc, command, current_env, t.rank) | |||
else: | |||
# 更新当前的环境变量 | |||
os.environ.update(current_env) | |||
def get_global_env(self): | |||
global_envs = copy.copy(os.environ.copy()) | |||
self.gloo_rendezvous_dir = tempfile.mkdtemp() | |||
# launch中涉及的gloo环境 | |||
global_envs["PADDLE_WITH_GLOO"] = str(os.getenv("PADDLE_WITH_GLOO", "0")) | |||
global_envs["PADDLE_GLOO_RENDEZVOUS"] = "3" | |||
global_envs["PADDLE_GLOO_FS_PATH"] = self.gloo_rendezvous_dir | |||
global_envs["PADDLE_DISTRI_BACKEND"] = "nccl" | |||
# 通过FNLP初始化的标志 | |||
global_envs[FASTNLP_DISTRIBUTED_CHECK] = f"{','.join([str(g) for g in self.devices])}" | |||
# 统计全局信息 | |||
device_ids = [] | |||
for t in self.sub_trainers: | |||
device_ids.append([str(acc) for acc in t.devices]) | |||
world_device_ids = [':'.join(ele) for ele in device_ids] | |||
# 全局环境变量 | |||
global_envs.update({ | |||
# world_size | |||
"PADDLE_TRAINERS_NUM": f"{len(self.sub_trainers)}", | |||
"PADDLE_TRAINER_ENDPOINTS": ",".join(self.endpoints), | |||
"PADDLE_WORLD_DEVICE_IDS": ",".join(world_device_ids), | |||
}) | |||
return global_envs | |||
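# Example of the resulting environment for devices == [0, 1] (hedged, the ports are illustrative):
#   PADDLE_TRAINERS_NUM=2
#   PADDLE_TRAINER_ENDPOINTS=127.0.0.1:6070,127.0.0.1:6071
#   PADDLE_WORLD_DEVICE_IDS=0,1
#   FASTNLP_DISTRIBUTED_CHECK=0,1
# plus, per process, PADDLE_TRAINER_ID / PADDLE_RANK_IN_NODE / FLAGS_selected_gpus set in open_subprocess().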
def set_endpoints(self): | |||
""" | |||
Reference to `get_cluster_from_args` | |||
""" | |||
self.node_ip = "127.0.0.1" | |||
free_ports = None | |||
if os.environ.get("FLAGS_START_PORT") is None: | |||
free_ports = find_free_ports(len(self.devices)) | |||
if free_ports is not None: | |||
free_ports = list(free_ports) | |||
else: | |||
start_port = int(os.getenv("FLAGS_START_PORT", "6070")) | |||
free_ports = [ | |||
x for x in range(start_port, start_port + len(self.devices)) | |||
] | |||
self.endpoints = ["%s:%d" % (self.node_ip, port) for port in free_ports] | |||
def get_process_info(self): | |||
""" | |||
Reference to `get_cluster` | |||
""" | |||
sub_trainers = [] | |||
assert len(self.endpoints) >= len( | |||
self.devices | |||
), "current trainer_endpoints size should be greater equal than acclerators size." | |||
for i in range(len(self.devices)): | |||
sub_trainer = SubTrainer(f"{self.endpoints[i]}", i) | |||
if isinstance(self.devices[i], (list, tuple)): | |||
sub_trainer.devices.extend(self.devices[i]) | |||
else: | |||
sub_trainer.devices.append(self.devices[i]) | |||
sub_trainers.append(sub_trainer) | |||
return sub_trainers |
@@ -0,0 +1,87 @@ | |||
import os | |||
from typing import Optional, List, Sequence, Union | |||
from .paddle_driver import PaddleDriver | |||
from .single_device import PaddleSingleDriver | |||
from .fleet import PaddleFleetDriver | |||
from fastNLP.envs.imports import _NEED_IMPORT_PADDLE | |||
from fastNLP.envs.env import FASTNLP_DISTRIBUTED_CHECK | |||
from fastNLP.core.log import logger | |||
if _NEED_IMPORT_PADDLE: | |||
import paddle | |||
def initialize_paddle_driver(driver: str, device: Optional[Union[str, int, List[int]]], | |||
model: paddle.nn.Layer, **kwargs) -> PaddleDriver: | |||
r""" | |||
用来根据参数 `driver` 和 `device` 来确定并且初始化一个具体的 `Driver` 实例然后返回回去; | |||
注意如果输入的 `device` 如果和 `driver` 对应不上就直接报错; | |||
:param driver: 该参数的值应为以下之一:["paddle", "fleet"]; | |||
:param device: 该参数的格式与 `Trainer` 对参数 `device` 的要求一致; | |||
:param model: 训练或者评测的具体的模型; | |||
:return: 返回一个元组,元组的第一个值是具体的基于 pytorch 的 `Driver` 实例,元组的第二个值是该 driver 的名字(用于检测一个脚本中 | |||
先后 driver 的次序的正确问题); | |||
""" | |||
if "PADDLE_TRAINERS_NUM" in os.environ and "PADDLE_RANK_IN_NODE" in os.environ and FASTNLP_DISTRIBUTED_CHECK not in os.environ: | |||
if device is not None: | |||
logger.warning("Parameter `device` would be ignored when you are using `paddle.distributed.launch` to pull " | |||
"up your script. And we will directly get the local device via " | |||
"`f'gpu:{os.environ['FLAGS_selected_gpus']}')`.") | |||
device = [int(g) for g in os.environ["FLAGS_selected_gpus"].split(",")] | |||
return PaddleFleetDriver(model, f"gpu:{os.environ['PADDLE_RANK_IN_NODE']}", True, **kwargs) | |||
if driver not in {"paddle", "fleet"}: | |||
raise ValueError("Parameter `driver` can only be one of these values: ['paddle', 'fleet'].") | |||
cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES") | |||
user_visible_devices = os.getenv("USER_CUDA_VISIBLE_DEVICES") | |||
# 优先级 user > cuda | |||
# 判断单机情况 device 的合法性 | |||
# 分布式情况下通过 world_device 判断 | |||
if user_visible_devices is not None: | |||
_could_use_device_num = len(user_visible_devices.split(",")) | |||
elif cuda_visible_devices is not None: | |||
_could_use_device_num = len(cuda_visible_devices.split(",")) | |||
else: | |||
_could_use_device_num = paddle.device.cuda.device_count() | |||
if isinstance(device, int): | |||
if device < 0 and device != -1: | |||
raise ValueError("Parameter `device` can only be '-1' when it is smaller than 0.") | |||
if device >= _could_use_device_num: | |||
raise ValueError("The gpu device that parameter `device` specifies is not existed.") | |||
device = f"gpu:{device}" | |||
elif isinstance(device, Sequence) and not isinstance(device, str): | |||
device = list(set(device)) | |||
for each in device: | |||
if not isinstance(each, int): | |||
raise ValueError("When parameter `device` is 'Sequence' type, the value in it should be 'int' type.") | |||
elif each < 0: | |||
raise ValueError("When parameter `device` is 'Sequence' type, the value in it should be bigger than 0.") | |||
if len(device) == 1: | |||
# 传入了 [1] 这样的,视为单卡。 | |||
device = device[0] | |||
elif device is not None and not isinstance(device, str): | |||
raise ValueError("Parameter `device` is wrong type, please check our documentation for the right use.") | |||
if driver == "paddle": | |||
if not isinstance(device, List): | |||
return PaddleSingleDriver(model, device, **kwargs) | |||
else: | |||
logger.warning("Notice you are using `paddle` driver but your chosen `device` are multi gpus, we will use" | |||
"`Fleetriver` by default. But if you mean using `PaddleFleetDriver`, you should choose parameter" | |||
"`driver` as `PaddleFleetDriver`.") | |||
return PaddleFleetDriver(model, device, **kwargs) | |||
elif driver == "fleet": | |||
if not isinstance(device, List): | |||
if device == "cpu": | |||
raise ValueError("You are using `fleet` driver, but your chosen `device` is 'cpu'.") | |||
logger.warning("Notice you are using `fleet` driver, but your chosen `device` is only one gpu, we will" | |||
"still use `PaddleFleetDriver` for you, but if you mean using `PaddleSingleDriver`, you should " | |||
"choose `paddle` driver.") | |||
return PaddleFleetDriver(model, device, **kwargs) | |||
else: | |||
return PaddleFleetDriver(model, device, **kwargs) |
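# Dispatch summary of initialize_paddle_driver (hedged: `MyModel()` is an illustrative paddle.nn.Layer):
#   initialize_paddle_driver("paddle", 1, MyModel())        -> PaddleSingleDriver on "gpu:1"
#   initialize_paddle_driver("paddle", [0, 1], MyModel())   -> PaddleFleetDriver (with a warning)
#   initialize_paddle_driver("fleet",  [0, 1], MyModel())   -> PaddleFleetDriver
#   initialize_paddle_driver("fleet",  "cpu",  MyModel())   -> raises ValueError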
@@ -0,0 +1,315 @@ | |||
import os | |||
import random | |||
from typing import Union, Optional, Callable, Dict | |||
from functools import partial | |||
import numpy as np | |||
from .utils import _build_fp16_env | |||
from fastNLP.envs.imports import _NEED_IMPORT_PADDLE | |||
from fastNLP.core.drivers.driver import Driver | |||
from fastNLP.core.utils import apply_to_collection, paddle_move_data_to_device | |||
from fastNLP.envs import rank_zero_call | |||
from fastNLP.envs import FASTNLP_SEED_WORKERS | |||
from fastNLP.core.log import logger | |||
if _NEED_IMPORT_PADDLE: | |||
import paddle | |||
from paddle.io import DataLoader, IterableDataset | |||
from paddle.optimizer import Optimizer | |||
_reduces = { | |||
'max': paddle.max, | |||
'min': paddle.min, | |||
'mean': paddle.mean, | |||
'sum': paddle.sum | |||
} | |||
class PaddleDriver(Driver): | |||
r""" | |||
Paddle框架的Driver,包括实现单卡训练的`PaddleSingleDriver`和分布式训练的`PaddleFleetDriver`。 | |||
""" | |||
def __init__(self, model, fp16: Optional[bool] = False, **kwargs): | |||
if not isinstance(model, paddle.nn.Layer): | |||
raise ValueError(f"Parameter `model` can not be `{type(model)}` in `PaddleDriver`, it should be exactly " | |||
f"`paddle.nn.Layer` type.") | |||
super(PaddleDriver, self).__init__(model) | |||
self.fp16 = fp16 | |||
# scaler的参数 | |||
self.auto_cast, _grad_scaler = _build_fp16_env(dummy=not fp16) | |||
self.grad_scaler = _grad_scaler() | |||
def zero_grad(self, set_to_none: bool = False): | |||
r""" | |||
实现深度学习中的梯度的置零操作,应当直接通过优化器 optimizers 来将梯度置零; | |||
注意梯度累积不需要在这里实现,trainer 已经在内部实现了梯度累积; | |||
:param set_to_none: 用来判断是否需要将梯度直接置为 None;Paddle中这个参数无效。 | |||
""" | |||
# if set_to_none: | |||
# log.warning("Parameter `set_to_none` does nothing in paddle since grad cannot be set directly.") | |||
for optimizer in self.optimizers: | |||
optimizer.clear_grad() | |||
@staticmethod | |||
def _check_dataloader_legality(dataloader, dataloader_name, is_train: bool = False): | |||
r""" | |||
该函数会在 trainer 或者 evaluator 设置 dataloader 后检测 dataloader 的合法性。 | |||
要求传入的 dataloader 必须为 `paddle.io.DataLoader` 或包含该类型的字典。 | |||
:param dataloader: 需要检测的输入的 `dataloader`; | |||
:param dataloader_name: | |||
:param is_train: | |||
""" | |||
if is_train: | |||
if not isinstance(dataloader, DataLoader): | |||
raise ValueError(f"Parameter `{dataloader_name}` should be 'paddle.io.DataLoader' type, not {type(dataloader)}.") | |||
# TODO 我们先禁止 dataloader 的 dataset 是 IterableDataset 种类; | |||
if isinstance(dataloader.dataset, IterableDataset): | |||
raise TypeError("`IterableDataset` is not allowed.") | |||
else: | |||
if not isinstance(dataloader, Dict): | |||
raise ValueError(f"Parameter `{dataloader_name}` should be 'Dict' type, not {type(dataloader)}.") | |||
else: | |||
for each_dataloader in dataloader.values(): | |||
if not isinstance(each_dataloader, DataLoader): | |||
raise ValueError(f"Each dataloader of parameter `{dataloader_name}` should be 'paddle.io.DataLoader' " | |||
f"type, not {type(each_dataloader)}.") | |||
if isinstance(each_dataloader.dataset, IterableDataset): | |||
raise TypeError("`IterableDataset` is not allowed.") | |||
@staticmethod | |||
def _check_optimizer_legality(optimizers): | |||
r""" | |||
对于用户传入 trainer 的每一个 optimizer检测其合法性,必须为`paddle.optimizer.Optimizer`类型。 | |||
:param optimizers: 需要检测的 `optimizers`; | |||
""" | |||
for each_optimizer in optimizers: | |||
if not isinstance(each_optimizer, Optimizer): | |||
raise ValueError(f"Each optimizer of parameter `optimizers` should be 'paddle.optimizer.Optimizer' type, " | |||
f"not {type(each_optimizer)}.") | |||
def check_evaluator_mode(self, mode: str): | |||
r""" | |||
因为我们在具体的 driver 的 validate_step 和 test_step 的逻辑是如果模型没有实现本函数,那么就去检测模型是否实现了另一个函数; | |||
因此如果用户的 evaluator mode 是 validate,但是传入的 model 却没有实现 validate_step 函数,而是实现了 test_step 函数,那么 | |||
我们应当提醒用户这一行为; | |||
""" | |||
model = self.unwrap_model() | |||
if mode == "validate": | |||
if not hasattr(model, "validate_step"): | |||
if hasattr(model, "test_step"): | |||
logger.warning( | |||
"Your model does not have 'validate_step' method but has 'test_step' method, but you" | |||
"are using 'Evaluator.validate', we are going to use 'test_step' to substitute for" | |||
"'validate_step'.") | |||
else: | |||
if not hasattr(model, "test_step"): | |||
if hasattr(model, "validate_step"): | |||
logger.warning("Your model does not have 'test_step' method but has 'validate' method, but you" | |||
"are using 'Evaluator.test', we are going to use 'validate_step' to substitute for" | |||
"'test_step'.") | |||
@staticmethod | |||
def tensor_to_numeric(tensor, reduce=None): | |||
r""" | |||
Convert a `tensor` object (of type `paddle.Tensor`) into a python `numeric` object; if the tensor contains only one
element, a float or an int is returned.
:param tensor: the `tensor` object to be converted
:param reduce: one of ['sum', 'max', 'mean', 'min']; if not None, the given reduce method is applied to the tensor
before a float or int object is returned.
:return: the converted result
""" | |||
if tensor is None: | |||
return None | |||
def _translate(_data): | |||
# 如果只含有一个元素,则返回元素本身,而非list | |||
if _data.numel().item() == 1: | |||
return _data.item() | |||
if reduce is None: | |||
return _data.tolist() | |||
else: | |||
return _reduces[reduce](_data).item() | |||
return apply_to_collection( | |||
data=tensor, | |||
dtype=paddle.Tensor, | |||
function=_translate | |||
) | |||
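# Example (hedged sketch): tensor_to_numeric(paddle.to_tensor([1.0, 2.0]), reduce="sum") gives 3.0,
# tensor_to_numeric(paddle.to_tensor(5)) gives 5, and nested dict/list structures are converted element-wise.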
def set_model_mode(self, mode: str): | |||
r""" | |||
设置模型为 `train` / `eval` 的模式;目的是为切换模型训练和推理(会关闭dropout等)模式; | |||
:param mode: 应为二者之一:["train", "eval"]; | |||
""" | |||
assert mode in {"train", "eval"} | |||
getattr(self.model, mode)() | |||
@rank_zero_call | |||
def save_model(self, filepath: str, only_state_dict: bool = True, model_save_fn: Optional[Callable]=None, **kwargs): | |||
r""" | |||
保存模型的函数;注意函数 `save` 是用来进行断点重训的函数; | |||
如果 `model_save_fn` 是一个可调用的函数,那么我们会直接运行该函数; | |||
:param filepath: 保存文件的文件位置(需要包括文件名); | |||
:param only_state_dict: 是否只保存模型的 `state_dict`;注意该参数仅当 `model_save_fn` 为 None 时有效; | |||
:param model_save_fn: 用户传入的用来代替该函数本身保存逻辑的函数;如果该参数不为 None,那么我们会调用 model_save_fn(path); | |||
""" | |||
if model_save_fn is not None: | |||
model_save_fn(filepath) | |||
else: | |||
model = self.unwrap_model() | |||
if only_state_dict: | |||
paddle.save(model.state_dict(), filepath) | |||
else: | |||
input_spec = kwargs.get("input_spec", None) | |||
if input_spec is None: | |||
raise Exception("To save the whole Paddle Layer, parameter 'input_spec' is needed.") | |||
paddle.jit.save(model, filepath, input_spec) | |||
@staticmethod | |||
@rank_zero_call | |||
def load_model(filepath: str, load_dict: bool = True): | |||
r""" | |||
加载模型的函数;注意函数 `load` 是用来进行断点重训的函数; | |||
:param filepath: 需要被加载的对象的文件位置(需要包括文件名); | |||
:param load_dict: 是否加载state_dict,默认为True。当用户在save_model时将only_state_dict设置为False时, | |||
即保存了整个模型时,这个参数必须也为False | |||
:return: 返回加载指定文件后的结果; | |||
""" | |||
if load_dict: | |||
return paddle.load(filepath) | |||
else: | |||
return paddle.jit.load(filepath) | |||
@rank_zero_call | |||
def save(self, folder, states: Dict): | |||
r""" | |||
断点重训的保存函数,该函数会负责保存模型和 optimizers 的 state_dict; | |||
需要注意 driver 应当是无状态的,即不管什么时候调用 driver 的接口函数,其返回的结果应该都是一样的;因此,断点重训不需要保存 driver | |||
本身自己的任何状态;而每一个 driver 实例需要在该函数中实现保存模型和 optimizers 的 state_dict 的逻辑;同时妥善存储传入的 | |||
states 中的内容(主要用于恢复 Trainer ,Callback 等) | |||
需要保证该函数只在 global rank 0 上运行 | |||
:param folder: 保存断点重训的状态的文件名; | |||
:param states: 由 trainer 传入的一个字典,其中已经包含了为了实现断点重训所需要保存的其它对象的状态,Driver 应该只需要保存 | |||
该对象即可, Driver 应该不需要理解该对象,同时在 driver.load() 的时候,需要将 states 返回回去,load()返回的值与这里的 | |||
传入的值保持一致。 | |||
""" | |||
# 1. 保存模型的状态; | |||
model = self.unwrap_model() | |||
model_state_dict = {name: param.cpu().detach().clone() for name, param in model.state_dict().items()} | |||
# 对于单卡的 driver 来讲,我们实际上(现在)不应该考虑用户在DDP环境下使用单卡模式,从而造成效率损失; | |||
states["model_state_dict"] = model_state_dict | |||
# 2. 保存 optimizers 的状态; | |||
optimizers_state_dict = {} | |||
for i in range(len(self.optimizers)): | |||
optimizer: Optimizer = self.optimizers[i] | |||
optimizer_state = optimizer.state_dict() | |||
optimizer_state = {name: param.cpu().detach().clone() for name, param in optimizer_state.items()} | |||
optimizers_state_dict[f"optimizer{i}"] = optimizer_state # 注意这里没有使用 deepcopy,测试是不需要的; | |||
states["optimizers_state_dict"] = optimizers_state_dict | |||
paddle.save(states, folder) | |||
def load(self, filepath) -> Dict: | |||
r""" | |||
断点重训的加载函数,注意该函数会负责读取数据,并且恢复模型和 optimizers 的 state_dict 等; | |||
driver 实例需要在该函数中先加载模型和 optimizers 的 state_dict,然后将一个 state 字典返回给 trainer 。 | |||
因此 save 函数和 load 函数的接受和返回值应该是对应的; | |||
该函数需要在所有 rank 上执行。 | |||
:param filepath: 保存断点重训的状态的文件名; | |||
:return: 需要返回 save 函数输入的 states 内容; | |||
""" | |||
states = paddle.load(filepath) | |||
# 1. 加载 optimizers 的状态; | |||
optimizers_state_dict = states["optimizers_state_dict"] | |||
for i in range(len(self.optimizers)): | |||
optimizer: paddle.optimizer.Optimizer = self.optimizers[i] | |||
optimizer.set_state_dict(optimizers_state_dict[f"optimizer{i}"]) | |||
# 2. 加载模型状态; | |||
model = self.unwrap_model() | |||
model.load_dict(states["model_state_dict"]) | |||
self.barrier() | |||
return states | |||
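# Checkpoint round-trip sketch (hedged: `driver` already holds its model and optimizers, and the Trainer
# decides what extra states to pass in):
#   states = {"epoch": 3}
#   driver.save("checkpoints/epoch3.ckpt", states)      # rank 0 adds model/optimizer state_dicts and writes the file
#   restored = driver.load("checkpoints/epoch3.ckpt")   # every rank restores model/optimizers and gets the states back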
def get_evaluate_context(self): | |||
r""" | |||
返回一个不计算梯度的环境用来对模型进行评测; | |||
:return: context 上下文对象 `paddle.no_grad`; | |||
""" | |||
return paddle.no_grad | |||
@staticmethod | |||
def move_model_to_device(model: 'paddle.nn.Layer', device: Union[str, int, 'paddle.CUDAPlace', 'paddle.CPUPlace']): | |||
r""" | |||
用来将模型转移到指定的 device 上; | |||
在 Paddle 中使用可能会引起因与设置的设备不一致而产生的问题,请注意。 | |||
""" | |||
if device is not None: | |||
model.to(device) | |||
def move_data_to_device(self, batch: 'paddle.Tensor'): | |||
r""" | |||
将数据迁移到指定的机器上;batch 可能是 list 也可能 dict ,或其嵌套结构。 | |||
在 Paddle 中使用可能会引起因与设置的设备不一致而产生的问题,请注意。 | |||
:return: 将移动到指定机器上的 batch 对象返回; | |||
""" | |||
return paddle_move_data_to_device(batch, self.data_device) | |||
@staticmethod | |||
def worker_init_function(worker_id: int, rank: Optional[int] = None) -> None: # pragma: no cover | |||
"""The worker_init_fn that Lightning automatically adds to your dataloader if you previously set set the seed | |||
with ``seed_everything(seed, workers=True)``. | |||
See also the PyTorch documentation on | |||
`randomness in DataLoaders <https://pytorch.org/docs/stable/notes/randomness.html#dataloader>`_. | |||
""" | |||
# implementation notes: https://github.com/pytorch/pytorch/issues/5059#issuecomment-817392562 | |||
global_rank = rank if rank is not None else rank_zero_call.rank | |||
# TODO gpu | |||
process_seed = paddle.fluid.core.default_cpu_generator().initial_seed() | |||
# back out the base seed so we can use all the bits | |||
base_seed = process_seed - worker_id | |||
ss = np.random.SeedSequence([base_seed, worker_id, global_rank]) | |||
# use 128 bits (4 x 32-bit words) | |||
np.random.seed(ss.generate_state(4)) | |||
# Spawn distinct SeedSequences for the PyTorch PRNG and the stdlib random module | |||
paddle_ss, stdlib_ss = ss.spawn(2) | |||
paddle.seed(paddle_ss.generate_state(1, dtype=np.uint64)[0]) | |||
# use 128 bits expressed as an integer | |||
stdlib_seed = (stdlib_ss.generate_state(2, dtype=np.uint64).astype(object) * [1 << 64, 1]).sum() | |||
random.seed(stdlib_seed) | |||
def set_deterministic_dataloader(self, dataloader): | |||
r""" | |||
为了确定性训练要对 dataloader 进行修改,保证在确定随机数种子后,每次重新训练得到的结果是一样的; | |||
作用是替换 datalaoder 的 `worker_init_fn`。 | |||
""" | |||
if int(os.environ.get(FASTNLP_SEED_WORKERS, 0)) and dataloader.worker_init_fn is None: | |||
dataloader.worker_init_fn = partial(self.worker_init_function, rank=self.global_rank) | |||
def set_sampler_epoch(self, dataloader: 'DataLoader', cur_epoch_idx): | |||
r""" | |||
对于分布式的 sampler,dataloader 需要在每一个 epoch 前设置随机数种子,来保证每一个进程上的 shuffle 是一样的; | |||
:param cur_epoch_idx: 当前是第几个 epoch; | |||
""" | |||
if callable(getattr(dataloader.batch_sampler, "set_epoch", None)): | |||
dataloader.batch_sampler.set_epoch(cur_epoch_idx) |
@@ -0,0 +1,161 @@ | |||
from typing import Optional, Dict, Union | |||
from .paddle_driver import PaddleDriver | |||
from fastNLP.envs.imports import _NEED_IMPORT_PADDLE | |||
from fastNLP.core.utils import auto_param_call, get_paddle_gpu_str | |||
from fastNLP.core.samplers import ReproducibleBatchSampler, ReproducibleIterator | |||
from fastNLP.core.log import logger | |||
if _NEED_IMPORT_PADDLE: | |||
import paddle | |||
from paddle.fluid.reader import _DatasetKind | |||
__all__ = [ | |||
"PaddleSingleDriver", | |||
] | |||
class PaddleSingleDriver(PaddleDriver): | |||
def __init__(self, model, device: Optional[str], fp16: Optional[bool] = False, **kwargs): | |||
super(PaddleSingleDriver, self).__init__(model, fp16=fp16, **kwargs) | |||
if device is None: | |||
raise ValueError("Parameter `device` can not be None in `PaddleSingleDriver`.") | |||
if isinstance(device, int): | |||
self.model_device = get_paddle_gpu_str(device) | |||
else: | |||
self.model_device = device | |||
self.local_rank = 0 | |||
self.global_rank = 0 | |||
self.world_size = 1 | |||
if isinstance(model, paddle.DataParallel): | |||
# 注意这里的 unwrap_model 调用的是具体子类的方法; | |||
model = self.unwrap_model() | |||
if hasattr(model, "train_step"): | |||
logger.warning("Notice your model is a `paddle.DataParallel` model. And your model also " | |||
"implements the `train_step` method, which we can not call actually, we will " | |||
" call `forward` function instead of `train_step` and you should note that.") | |||
self._train_step = self.model | |||
self._train_signature_fn = model.forward | |||
if hasattr(model, "validate_step"): | |||
logger.warning("Notice your model is a `paddle.DataParallel` model. And your model also " | |||
"implements the `validate_step` method, which we can not call actually, we " | |||
"will call `forward` function instead of `validate_step` and you should note that.") | |||
self._validate_step = self.model | |||
self._validate_signature_fn = model.forward | |||
if hasattr(model, "test_step"): | |||
logger.warning("Notice your model is a `paddle.DataParallel` model. And your model also " | |||
"implements the `test_step` method, which we can not call actually, we will " | |||
"call `forward` function instead of `test_step` and you should note that.") | |||
self._test_step = self.model | |||
self._test_signature_fn = model.forward | |||
else: | |||
if hasattr(self.model, "train_step"): | |||
self._train_step = self.model.train_step | |||
self._train_signature_fn = None | |||
else: | |||
self._train_step = self.model | |||
# 输入的模型是 `DataParallel`,我们需要保证其 signature_fn 是正确的; | |||
model = self.unwrap_model() | |||
self._train_signature_fn = model.forward | |||
if hasattr(self.model, "validate_step"): | |||
self._validate_step = self.model.validate_step | |||
self._validate_signature_fn = None | |||
elif hasattr(self.model, "test_step"): | |||
self._validate_step = self.model.test_step | |||
self._validate_signature_fn = self.model.test_step | |||
else: | |||
self._validate_step = self.model | |||
model = self.unwrap_model() | |||
self._validate_signature_fn = model.forward | |||
if hasattr(self.model, "test_step"): | |||
self._test_step = self.model.test_step | |||
self._test_signature_fn = None | |||
elif hasattr(self.model, "validate_step"): | |||
self._test_step = self.model.validate_step | |||
self._test_signature_fn = self.model.validate_step | |||
else: | |||
self._test_step = self.model | |||
model = self.unwrap_model() | |||
self._test_signature_fn = model.forward | |||
def setup(self): | |||
paddle.device.set_device(self.model_device) | |||
self.model.to(self.model_device) | |||
def train_step(self, batch) -> Dict: | |||
# 如果 batch 是一个 Dict,我们就默认帮其做参数匹配,否则就直接传入到 `train_step` 函数中,让用户自己处理; | |||
if isinstance(batch, Dict): | |||
return auto_param_call(self._train_step, batch, signature_fn=self._train_signature_fn) | |||
else: | |||
return self._train_step(batch) | |||
def backward(self, loss): | |||
self.grad_scaler.scale(loss).backward() | |||
def step(self): | |||
for optimizer in self.optimizers: | |||
self.grad_scaler.step(optimizer) | |||
self.grad_scaler.update() | |||
def validate_step(self, batch) -> Dict: | |||
if isinstance(batch, Dict): | |||
return auto_param_call(self._validate_step, batch, signature_fn=self._validate_signature_fn) | |||
else: | |||
return self._validate_step(batch) | |||
def test_step(self, batch) -> Dict: | |||
if isinstance(batch, Dict): | |||
return auto_param_call(self._test_step, batch, signature_fn=self._test_signature_fn) | |||
else: | |||
return self._test_step(batch) | |||
def replace_sampler(self, dataloader, dist_sampler: Union[str, ReproducibleBatchSampler, ReproducibleIterator], reproducible: bool = False): | |||
# 暂时不支持IteratorDataset | |||
assert dataloader.dataset_kind != _DatasetKind.ITER, \
"FastNLP does not support `IterableDataset` yet."
if isinstance(dist_sampler, ReproducibleBatchSampler): | |||
dataloader.batch_sampler = dist_sampler | |||
return dataloader | |||
if isinstance(dist_sampler, ReproducibleIterator): | |||
dataloader.batch_sampler.sampler = dist_sampler | |||
return dataloader | |||
if reproducible: | |||
if isinstance(dataloader.batch_sampler.sampler, ReproducibleIterator): | |||
return dataloader | |||
elif isinstance(dataloader.batch_sampler, ReproducibleBatchSampler): | |||
return dataloader | |||
else: | |||
# TODO | |||
batch_sampler = ReproducibleBatchSampler( | |||
batch_sampler=dataloader.batch_sampler, | |||
batch_size=dataloader.batch_sampler.batch_size, | |||
drop_last=dataloader.drop_last | |||
) | |||
dataloader.batch_sampler = batch_sampler | |||
return dataloader | |||
else: | |||
return dataloader | |||
def unwrap_model(self): | |||
if isinstance(self.model, paddle.DataParallel): | |||
return self.model._layers | |||
else: | |||
return self.model | |||
@property | |||
def data_device(self): | |||
""" | |||
In single-card mode there is no separate data_device; the model device is returned.
""" | |||
return self.model_device | |||
def is_distributed(self): | |||
return False |
@@ -0,0 +1,351 @@ | |||
import socket | |||
import os | |||
import struct | |||
import random | |||
import inspect | |||
import numpy as np | |||
from contextlib import ExitStack, closing | |||
from enum import IntEnum | |||
from typing import Dict, Optional, Union | |||
from fastNLP.envs.imports import _NEED_IMPORT_PADDLE | |||
from fastNLP.core.utils import get_paddle_device_id, auto_param_call | |||
from fastNLP.envs.env import FASTNLP_GLOBAL_SEED, FASTNLP_SEED_WORKERS, USER_CUDA_VISIBLE_DEVICES | |||
from fastNLP.core.log import logger | |||
if _NEED_IMPORT_PADDLE: | |||
import paddle | |||
from paddle import nn | |||
from paddle.nn import Layer | |||
from paddle.io import DataLoader, BatchSampler | |||
from paddle.amp import auto_cast, GradScaler | |||
else: | |||
from fastNLP.core.utils.dummy_class import DummyClass as Layer | |||
__all__ = [ | |||
"paddle_seed_everything", | |||
] | |||
def _select_seed_randomly(min_seed_value: int = 0, max_seed_value: int = 255) -> int: | |||
return random.randint(min_seed_value, max_seed_value) | |||
def paddle_seed_everything(seed: Optional[int] = None, workers: bool = False) -> int: | |||
max_seed_value = np.iinfo(np.uint32).max | |||
min_seed_value = np.iinfo(np.uint32).min | |||
if seed is None: | |||
env_seed = os.environ.get("GLOBAL_SEED") | |||
if env_seed is None: | |||
seed = _select_seed_randomly(min_seed_value, max_seed_value) | |||
# rank_zero_warn(f"No seed found, seed set to {seed}") | |||
else: | |||
try: | |||
seed = int(env_seed) | |||
except ValueError: | |||
seed = _select_seed_randomly(min_seed_value, max_seed_value) | |||
# rank_zero_warn(f"Invalid seed found: {repr(env_seed)}, seed set to {seed}") | |||
elif not isinstance(seed, int): | |||
seed = int(seed) | |||
if not (min_seed_value <= seed <= max_seed_value): | |||
logger.warning("Your seed value is two big or two small for numpy, we will choose a random seed for " | |||
"you.") | |||
# rank_zero_warn(f"{seed} is not in bounds, numpy accepts from {min_seed_value} to {max_seed_value}") | |||
seed = _select_seed_randomly(min_seed_value, max_seed_value) | |||
# using `log.info` instead of `rank_zero_info`, | |||
# so users can verify the seed is properly set in distributed training. | |||
# log.info(f"Global seed set to {seed}") | |||
os.environ[FASTNLP_GLOBAL_SEED] = str(seed) | |||
random.seed(seed) | |||
np.random.seed(seed) | |||
# paddle的seed函数会自行判断是否在gpu环境,如果在的话会设置gpu的种子 | |||
paddle.seed(seed) | |||
os.environ[FASTNLP_SEED_WORKERS] = f"{int(workers)}" | |||
return seed | |||
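# Example (hedged sketch): paddle_seed_everything(42, workers=True) seeds python `random`, numpy and paddle
# with 42, and exports FASTNLP_GLOBAL_SEED=42 / FASTNLP_SEED_WORKERS=1 so that sub-processes spawned by
# FleetLauncher can call reset_seed() and reproduce the same random state.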
def reset_seed() -> None: | |||
""" | |||
fleet 会开启多个进程,因此当用户在脚本中指定 seed_everything 时,在开启多个脚本后,会在每个脚本内重新 | |||
进行随机数的设置; | |||
""" | |||
seed = os.environ.get(FASTNLP_GLOBAL_SEED, None) | |||
workers = os.environ.get(FASTNLP_SEED_WORKERS, "0") | |||
if seed is not None: | |||
paddle_seed_everything(int(seed), workers=bool(int(workers))) | |||
class ForwardState(IntEnum): | |||
TRAIN = 0 | |||
VALIDATE = 1 | |||
TEST = 2 | |||
PREDICT = 3 | |||
_MODE_PARAMETER = "_forward_state" | |||
class _FleetWrappingModel(Layer): | |||
""" | |||
参考_DDPWrappingModel,paddle的分布式训练也需要用paddle.nn.DataParallel进行包装,采用和 | |||
pytorch相似的处理方式 | |||
""" | |||
def __init__(self, model: 'nn.Layer'): | |||
super(_FleetWrappingModel, self).__init__() | |||
self.model = model | |||
if isinstance(model, paddle.DataParallel): | |||
model = model._layers | |||
if hasattr(model, "train_step"): | |||
logger.warning( | |||
"Notice your model is a `paddle.DataParallel` model. And your " | |||
"model also implements the `train_step` method, which we can not call actually, we will" | |||
" call `forward` function instead of `train_step` and you should note that.") | |||
self._train_step = self.model | |||
self._train_signature_fn = model.forward | |||
if hasattr(model, "validate_step"): | |||
logger.warning( | |||
"Notice your model is a `paddle.DataParallel` model. And your " | |||
"model also implements the `validate_step` method, which we can not call actually, " | |||
"we will call `forward` function instead of `validate_step` and you should note that.") | |||
self._validate_step = self.model | |||
self._validate_signature_fn = model.forward | |||
if hasattr(model, "test_step"): | |||
logger.warning( | |||
"Notice your model is a `paddle.DataParallel` model. And your " | |||
"model also implements the `test_step` method, which we can not call actually, we will" | |||
" call `forward` function instead of `test_step` and you should note that.") | |||
self._test_step = self.model | |||
self._test_signature_fn = model.forward | |||
else: | |||
if hasattr(model, "train_step"): | |||
self._train_step = model.train_step | |||
self._train_signature_fn = None | |||
else: | |||
self._train_step = model | |||
self._train_signature_fn = model.forward | |||
if hasattr(model, "validate_step"): | |||
self._validate_step = model.validate_step | |||
self._validate_signature_fn = None | |||
elif hasattr(model, "test_step"): | |||
self._validate_step = model.test_step | |||
self._validate_signature_fn = None | |||
else: | |||
self._validate_step = model | |||
self._validate_signature_fn = model.forward | |||
if hasattr(model, "test_step"): | |||
self._test_step = model.test_step | |||
self._test_signature_fn = None | |||
elif hasattr(model, "validate_step"): | |||
self._test_step = model.validate_step | |||
self._test_signature_fn = None | |||
else: | |||
self._test_step = model | |||
self._test_signature_fn = model.forward | |||
def forward(self, batch, **kwargs) -> Dict: | |||
_forward_state = kwargs.pop(_MODE_PARAMETER) | |||
if _forward_state == ForwardState.TRAIN: | |||
if isinstance(batch, Dict): | |||
return auto_param_call(self._train_step, batch, signature_fn=self._train_signature_fn) | |||
else: | |||
return self._train_step(batch) | |||
elif _forward_state == ForwardState.VALIDATE: | |||
if isinstance(batch, Dict): | |||
return auto_param_call(self._validate_step, batch, signature_fn=self._validate_signature_fn) | |||
else: | |||
return self._validate_step(batch) | |||
elif _forward_state == ForwardState.TEST: | |||
if isinstance(batch, Dict): | |||
return auto_param_call(self._test_step, batch, signature_fn=self._test_signature_fn) | |||
else: | |||
return self._test_step(batch) | |||
elif _forward_state == ForwardState.PREDICT: | |||
raise NotImplementedError("'PREDICT' mode has not been implemented.") | |||
else: | |||
raise NotImplementedError("You should direct a concrete mode.") | |||
class DummyGradScaler: | |||
""" | |||
用于仿造的GradScaler对象,防止重复写大量的if判断 | |||
""" | |||
def __init__(self, *args, **kwargs): | |||
pass | |||
def get_scale(self): | |||
return 1.0 | |||
def is_enabled(self): | |||
return False | |||
def scale(self, outputs): | |||
return outputs | |||
def step(self, optimizer, *args, **kwargs): | |||
optimizer.step(*args, **kwargs) | |||
def update(self, new_scale=None): | |||
pass | |||
def unscale_(self, optimizer): | |||
pass | |||
def load_state_dict(self, state_dict): | |||
pass | |||
def state_dict(self): | |||
return {} | |||
def _build_fp16_env(dummy=False): | |||
if dummy: | |||
auto_cast = ExitStack | |||
GradScaler = DummyGradScaler | |||
else: | |||
if not paddle.device.is_compiled_with_cuda(): | |||
raise RuntimeError("No cuda") | |||
if paddle.device.cuda.get_device_capability(0)[0] < 7: | |||
logger.warning( | |||
"NOTE: your device does NOT support faster training with fp16, " | |||
"please switch to FP32 which is likely to be faster" | |||
) | |||
return auto_cast, GradScaler | |||
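# Usage sketch of _build_fp16_env (hedged: this mirrors how PaddleDriver wires it up; `model`, `batch` and
# `optimizer` are placeholders):
#   auto_cast, GradScaler = _build_fp16_env(dummy=not fp16)
#   scaler = GradScaler()
#   with auto_cast():
#       loss = model(batch)
#   scaler.scale(loss).backward()       # with dummy=True this is a plain backward()
#   scaler.step(optimizer)
#   scaler.update()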
def find_free_ports(num): | |||
def __free_port(): | |||
with closing(socket.socket(socket.AF_INET, socket.SOCK_STREAM)) as s: | |||
s.setsockopt(socket.SOL_SOCKET, socket.SO_LINGER, | |||
struct.pack('ii', 1, 0)) | |||
s.bind(('', 0)) | |||
return s.getsockname()[1] | |||
port_set = set() | |||
step = 0 | |||
while True: | |||
port = __free_port() | |||
if port not in port_set: | |||
port_set.add(port) | |||
if len(port_set) >= num: | |||
return port_set | |||
step += 1 | |||
if step > 400: | |||
logger.error( | |||
"can't find avilable port and use the specified static port now!" | |||
) | |||
return None | |||
return None | |||
def get_host_name_ip(): | |||
try: | |||
host_name = socket.gethostname() | |||
host_ip = socket.gethostbyname(host_name) | |||
return host_name, host_ip | |||
except Exception:
return None | |||
def get_device_from_visible(device: Union[str, int]): | |||
""" | |||
Get the corresponding device index when CUDA_VISIBLE_DEVICES is set.
For example, with CUDA_VISIBLE_DEVICES=2,3 and device=3, 1 is returned.
:param device: the device name before conversion
:return: the converted device id
""" | |||
if device == "cpu": | |||
return device | |||
cuda_visible_devices = os.getenv("CUDA_VISIBLE_DEVICES") | |||
idx = get_paddle_device_id(device) | |||
if cuda_visible_devices is None or cuda_visible_devices == "": | |||
# 这个判断一般不会发生,因为 fastnlp 会为 paddle 强行注入 CUDA_VISIBLE_DEVICES | |||
return idx | |||
else: | |||
# use USER_CUDA_VISIBLE_DEVICES to get the device the user expects
user_visible_devices = os.getenv(USER_CUDA_VISIBLE_DEVICES)
if user_visible_devices is not None and user_visible_devices != "":
# not empty, which means the user has set CUDA_VISIBLE_DEVICES themselves
idx = user_visible_devices.split(",")[idx]
else: | |||
idx = str(idx) | |||
cuda_visible_devices_list = cuda_visible_devices.split(',') | |||
assert idx in cuda_visible_devices_list, "Can't find "\ | |||
"your devices %s in CUDA_VISIBLE_DEVICES[%s]."\ | |||
% (idx, cuda_visible_devices) | |||
res = cuda_visible_devices_list.index(idx) | |||
return res | |||
def replace_sampler(dataloader: "DataLoader", sampler: "BatchSampler"): | |||
# 拿到实例属性; | |||
instance_attrs = {k: v for k, v in vars(dataloader).items() if not k.startswith('_')} | |||
# 拿到 dataloader '__init__' 函数的默认函数签名; | |||
init_params = dict(inspect.signature(dataloader.__init__).parameters) | |||
# 这里为什么要单独弄的原因在于,用户在定制自己的 dataloader 的同时可能为了方便只设定一些参数,而后面直接使用 **kwargs 的方式,这时如果 | |||
# 其在初始化自己的 dataloader 实例的时候加入了一些其它的新的参数(首先这一步是必要的,因为我们只能通过这样加 sampler;另一方面,用户 | |||
# 可能确实通过 **kwargs 加入了一些新的参数),如果假设用户是这样使用的: "super().__init__(**kwargs)",那么我们就只能去 DataLoader | |||
# 中寻找; | |||
has_variadic_kwargs = any(v.kind is v.VAR_KEYWORD for k, v in init_params.items()) | |||
if has_variadic_kwargs: | |||
init_params.update(dict(inspect.signature(DataLoader.__init__).parameters)) | |||
del init_params["self"] | |||
# 因为我们刚才可能用 DataLoader 的默认参数将用户定制的 dataloader 的参数覆盖掉了,因此需要重新弄一遍; | |||
non_default_params = {name for name, p in init_params.items() if | |||
name in instance_attrs and p.default != instance_attrs[name]} | |||
# add `dataset` as it might have been replaced with `*args` | |||
non_default_params.add("dataset") | |||
reconstruct_args = {k: v for k, v in instance_attrs.items() if k in non_default_params} | |||
reconstruct_args.update({"batch_sampler": sampler, "shuffle": False, "drop_last": False, "batch_size": 1}) | |||
required_args = { | |||
p.name | |||
for p in init_params.values() | |||
if p.kind in (p.POSITIONAL_ONLY, p.POSITIONAL_OR_KEYWORD) | |||
and p.default is p.empty | |||
and p.name not in reconstruct_args | |||
} | |||
# 这种错误针对的是 __init__ 中的参数没有用同样名字的 self 挂上; | |||
if required_args: | |||
required_args = sorted(required_args) | |||
dataloader_self_name = dataloader.__class__.__name__ | |||
raise Exception( | |||
f"Trying to inject `DistributedBatchSampler` into the `{dataloader_self_name}` instance. " | |||
"This would fail as some of the `__init__` arguments are not available as instance attributes. " | |||
f"The missing attributes are {required_args}. " | |||
f"HINT: If you wrote the `{dataloader_self_name}` class, define `self.missing_arg_name` or " | |||
"manually add the `DistributedBatchSampler` as: " | |||
f"`{dataloader_self_name}(dataset, sampler=DistributedBatchSampler(dataset))`." | |||
) | |||
# 这种错误针对的是传入的 dataloader 不是直接的 DataLoader,而是定制了 DataLoader,但是 __init__ 中没有 **kwargs; | |||
if not has_variadic_kwargs: | |||
# the dataloader signature does not allow keyword arguments that need to be passed | |||
missing_kwargs = reconstruct_args.keys() - init_params.keys() | |||
if missing_kwargs: | |||
missing_kwargs = sorted(missing_kwargs) | |||
dataloader_self_name = dataloader.__class__.__name__ | |||
raise Exception( | |||
f"Trying to inject `DistributedBatchSampler` into the `{dataloader_self_name}` instance. " | |||
"This would fail as it doesn't expose all its attributes in the `__init__` signature. " | |||
f"The missing arguments are {missing_kwargs}. " | |||
f"HINT: If you wrote the `{dataloader_self_name}` class, add the `__init__` arguments or " | |||
"manually add the `DistributedBatchSampler` as: " | |||
f"`{dataloader_self_name}(dataset, sampler=DistributedBatchSampler(dataset))`." | |||
) | |||
return type(dataloader)(**reconstruct_args) |
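# Sketch of what replace_sampler guards against (hedged: `MyLoader`, `ds` and `new_batch_sampler` are
# hypothetical names used only for illustration):
#
#   class MyLoader(DataLoader):
#       def __init__(self, dataset, secret, **kwargs):
#           self.secret = secret                 # stored under the same name -> reconstruction works
#           super().__init__(dataset, **kwargs)
#
#   new_loader = replace_sampler(MyLoader(ds, secret=1, batch_size=8), new_batch_sampler)
#
# If `secret` were stored under a different attribute name, the "missing attributes" Exception above would
# be raised; if MyLoader took no **kwargs, the "missing arguments" Exception would be raised instead.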