@@ -14,12 +14,13 @@ from fastNLP.envs import( | |||
from fastNLP.envs.imports import _NEED_IMPORT_TORCH, _NEED_IMPORT_DEEPSPEED | |||
if _NEED_IMPORT_TORCH: | |||
import pytorch_lightning | |||
import torch | |||
import torch.distributed as dist | |||
from torch.optim import Optimizer | |||
if _NEED_IMPORT_DEEPSPEED: | |||
import deepspeed | |||
from deepspeed import DeepSpeedEngine, DeepSpeedOptimizer | |||
__all__ = [ | |||
"DeepSpeedDriver", | |||
@@ -33,7 +34,6 @@ class DeepSpeedDriver(TorchDDPDriver): | |||
parallel_device: Union[List["torch.device"], "torch.device"], | |||
is_pull_by_torch_run = False, | |||
fp16: bool = False, | |||
strategy= "deepspeed", | |||
**kwargs | |||
): | |||
assert _NEED_IMPORT_DEEPSPEED, "Deepspeed is not imported." | |||
@@ -56,8 +56,22 @@ class DeepSpeedDriver(TorchDDPDriver): | |||
# 我们的 model_device 一定是 torch.device,而不是一个 list; | |||
self.model_device = parallel_device[self.local_rank] | |||
# 暂时不允许在外面初始化 | |||
# 如果用户自己在外面初始化了 deepspeed; | |||
self.outside_ddp = False | |||
if dist.is_initialized() and FASTNLP_DISTRIBUTED_CHECK not in os.environ and \ | |||
"fastnlp_torch_launch_not_ddp" not in os.environ: | |||
# 如果用户自己在外面初始化了 deepspeed,那么我们要求用户传入的模型一定是已经由 DeepSpeedEngine 包裹后的模型; | |||
if not isinstance(model, DeepSpeedEngine): | |||
raise RuntimeError( | |||
"It is not allowed to input a normal model instead of `DeepSpeedEngine` when" | |||
"you initialize the ddp process out of our control.") | |||
self.outside_ddp = True | |||
self.config = model.config | |||
# 用户只有将模型上传到对应机器上后才能用 DistributedDataParallel 包裹,因此如果用户在外面初始化了 DDP,那么在 TorchDDPDriver 中 | |||
# 我们就直接将 model_device 置为 None; | |||
self.model_device = None | |||
self._data_device = kwargs.get("data_device", None) | |||
if isinstance(self._data_device, int): | |||
if self._data_device < 0: | |||
@@ -84,7 +98,6 @@ class DeepSpeedDriver(TorchDDPDriver): | |||
self._has_setup = False # 设置这一参数是因为 evaluator 中也会进行 setup 操作,但是显然是不需要的也不应该的; | |||
self._has_ddpwrapped = False # 判断传入的模型是否经过 _has_ddpwrapped 包裹; | |||
self.strategy = strategy | |||
self.accumulation_steps = kwargs.get("accumulation_steps", 1) | |||
# 获取 batch_size 以设置 train_micro_batch_size_per_gpu 参数 | |||
train_dl = kwargs.get("train_dataloader", None) | |||
@@ -96,6 +109,14 @@ class DeepSpeedDriver(TorchDDPDriver): | |||
self.train_micro_batch_size = 1 | |||
self._ds_kwargs = kwargs.get("deepspeed_kwargs", {}) | |||
self.strategy = self._ds_kwargs.get("strategy", "deepspeed") | |||
@staticmethod | |||
def _check_optimizer_legality(optimizers): | |||
for each_optimizer in optimizers: | |||
if not isinstance(each_optimizer, (Optimizer, DeepSpeedOptimizer)): | |||
raise TypeError(f"Each optimizer of parameter `optimizers` should be 'Optimizer' or " | |||
f"'DeepSpeedOptimizer'type, not {type(each_optimizer)}.") | |||
def setup(self): | |||
r""" | |||
@@ -112,15 +133,19 @@ class DeepSpeedDriver(TorchDDPDriver): | |||
self.setup_config() | |||
# 如果用户需要使用多机模式,那么一定进入到这里; | |||
if self.is_pull_by_torch_run: | |||
# dist.get_world_size() 只能在 dist.init_process_group 初始化之后进行调用; | |||
self.world_size = int(os.environ.get("WORLD_SIZE")) | |||
self.global_rank = int(os.environ.get("RANK")) | |||
logger.info(f"World size: {self.world_size}, Global rank: {self.global_rank}") | |||
if self.outside_ddp: | |||
self.world_size = dist.get_world_size() | |||
self.global_rank = dist.get_rank() | |||
else: | |||
# dist.get_world_size() 只能在 dist.init_process_group 初始化之后进行调用; | |||
self.world_size = int(os.environ.get("WORLD_SIZE")) | |||
self.global_rank = int(os.environ.get("RANK")) | |||
logger.info(f"World size: {self.world_size}, Global rank: {self.global_rank}") | |||
if not dist.is_initialized(): | |||
deepspeed.init_distributed("nccl", distributed_port=self.master_port) | |||
if not dist.is_initialized(): | |||
deepspeed.init_distributed("nccl", distributed_port=self.master_port) | |||
os.environ["fastnlp_torch_launch_not_ddp"] = "yes" | |||
os.environ["fastnlp_torch_launch_not_ddp"] = "yes" | |||
# 进入到这里的情况时: | |||
# dist.is_initialized 一定为 False; | |||
@@ -146,8 +171,9 @@ class DeepSpeedDriver(TorchDDPDriver): | |||
self.world_size = dist.get_world_size() | |||
self.global_rank = dist.get_rank() | |||
torch.cuda.set_device(self.model_device) | |||
self.configure_ddp() | |||
if not self.outside_ddp: | |||
torch.cuda.set_device(self.model_device) | |||
self.configure_ddp() | |||
self.barrier() | |||
# 初始化 self._pids,从而使得每一个进程都能接受到 rank0 的 send 操作; | |||
@@ -166,7 +192,7 @@ class DeepSpeedDriver(TorchDDPDriver): | |||
def configure_ddp(self): | |||
# 设置 deepspeed | |||
if not isinstance(self.model, deepspeed.DeepSpeedEngine): | |||
if not isinstance(self.model, DeepSpeedEngine): | |||
model=_DeepSpeedWrappingModel(self.model, self.fp16) | |||
model_parameters = filter(lambda p: p.requires_grad, model.parameters()) | |||
self.model, ds_optimizer, _, _ = deepspeed.initialize( | |||
@@ -193,7 +219,6 @@ class DeepSpeedDriver(TorchDDPDriver): | |||
self.config = self._ds_kwargs.get("config") | |||
if self.config is not None: | |||
# TODO 究竟哪些参数按照config,哪些按照trainer参数 | |||
logger.warn("Notice that you have defined a configuration for deepspeed and parameters like" | |||
"`optimizers`, `strategy` and `fp16` may not take effects.") | |||
return | |||
@@ -258,12 +283,6 @@ class DeepSpeedDriver(TorchDDPDriver): | |||
def step(self): | |||
self.model.step() | |||
def unwrap_model(self): | |||
r""" | |||
:return: 返回原本的模型; | |||
""" | |||
return self.model.module.model | |||
def get_model_no_sync_context(self): | |||
r""" | |||
:return: 返回一个 ``context`` 上下文环境,用于关闭各个进程之间的同步;在 ``deepspeed`` 中,返回一个空的上下文 | |||
@@ -38,6 +38,9 @@ def initialize_torch_driver(driver: str, device: Optional[Union[str, "torch.devi | |||
if driver == 'fairscale': | |||
return FairScaleDriver(model, torch.device(f"cuda:{os.environ['LOCAL_RANK']}"), | |||
is_pull_by_torch_run=True, **kwargs) | |||
elif kwargs.get("deepspeed_kwargs") is not None: | |||
return DeepSpeedDriver(model, torch.device(f"cuda:{os.environ['LOCAL_RANK']}"), | |||
is_pull_by_torch_run=True, **kwargs) | |||
else: | |||
return TorchDDPDriver(model, torch.device(f"cuda:{os.environ['LOCAL_RANK']}"), | |||
is_pull_by_torch_run=True, **kwargs) | |||
@@ -73,6 +76,14 @@ def initialize_torch_driver(driver: str, device: Optional[Union[str, "torch.devi | |||
raise ValueError("Parameter `device` is wrong type, please check our documentation for the right use.") | |||
if driver == "torch": # single, ddp, 直接启动。 | |||
if kwargs.get("deepspeed_kwargs") is not None: | |||
# 选择的是 deepspeed | |||
if not isinstance(device, List): | |||
if device.type == 'cpu': | |||
raise ValueError("You are using `deepspeed` driver, but your chosen `device` is 'cpu'.") | |||
logger.warning_once("Notice you are using `deepspeed`, but the `device` is only one gpu.") | |||
return DeepSpeedDriver(model, [device], **kwargs) | |||
return DeepSpeedDriver(model, device, **kwargs) | |||
if not isinstance(device, List): | |||
return TorchSingleDriver(model, device, **kwargs) | |||
else: | |||
@@ -84,11 +95,4 @@ def initialize_torch_driver(driver: str, device: Optional[Union[str, "torch.devi | |||
logger.warning_once("Notice you are using `fairscale`, but the `device` is only one gpu.") | |||
return FairScaleDriver(model, [device], **kwargs) | |||
else: | |||
return FairScaleDriver(model, device, **kwargs) | |||
elif driver == "deepspeed": | |||
if not isinstance(device, List): | |||
if device.type == 'cpu': | |||
raise ValueError("You are using `deepspeed` driver, but your chosen `device` is 'cpu'.") | |||
logger.warning_once("Notice you are using `deepspeed`, but the `device` is only one gpu.") | |||
return DeepSpeedDriver(model, [device], **kwargs) | |||
return DeepSpeedDriver(model, device, **kwargs) | |||
return FairScaleDriver(model, device, **kwargs) |
@@ -0,0 +1,95 @@ | |||
""" | |||
这个文件测试多卡情况下使用 deepspeed 的情况:: | |||
>>> # 测试直接使用多卡 | |||
>>> python _test_trainer_deepspeed.py | |||
>>> # 测试通过 deepspeed 拉起 | |||
>>> deepspeed _test_trainer_deepspeed.py | |||
""" | |||
import sys | |||
sys.path.append("../../../") | |||
from dataclasses import dataclass | |||
from fastNLP.core.controllers.trainer import Trainer | |||
from fastNLP.core.metrics.accuracy import Accuracy | |||
from fastNLP.core.callbacks.progress_callback import RichCallback | |||
from torch.optim import Adam | |||
from torch.utils.data import DataLoader | |||
from tests.helpers.models.torch_model import TorchNormalModel_Classification_1 | |||
from tests.helpers.datasets.torch_data import TorchArgMaxDataset | |||
@dataclass | |||
class TrainDeepSpeedConfig: | |||
num_labels: int = 3 | |||
feature_dimension: int = 3 | |||
batch_size: int = 2 | |||
shuffle: bool = True | |||
evaluate_every = 2 | |||
def test_trainer_deepspeed( | |||
device, | |||
callbacks, | |||
strategy, | |||
config, | |||
n_epochs=2, | |||
): | |||
model = TorchNormalModel_Classification_1( | |||
num_labels=TrainDeepSpeedConfig.num_labels, | |||
feature_dimension=TrainDeepSpeedConfig.feature_dimension | |||
) | |||
optimizers = Adam(params=model.parameters(), lr=0.0001) | |||
train_dataloader = DataLoader( | |||
dataset=TorchArgMaxDataset(TrainDeepSpeedConfig.feature_dimension, 20), | |||
batch_size=TrainDeepSpeedConfig.batch_size, | |||
shuffle=True | |||
) | |||
val_dataloader = DataLoader( | |||
dataset=TorchArgMaxDataset(TrainDeepSpeedConfig.feature_dimension, 12), | |||
batch_size=TrainDeepSpeedConfig.batch_size, | |||
shuffle=True | |||
) | |||
train_dataloader = train_dataloader | |||
evaluate_dataloaders = val_dataloader | |||
evaluate_every = TrainDeepSpeedConfig.evaluate_every | |||
metrics = {"acc": Accuracy()} | |||
if config is not None: | |||
config["train_micro_batch_size_per_gpu"] = TrainDeepSpeedConfig.batch_size | |||
trainer = Trainer( | |||
model=model, | |||
driver="torch", | |||
device=device, | |||
optimizers=optimizers, | |||
train_dataloader=train_dataloader, | |||
evaluate_dataloaders=evaluate_dataloaders, | |||
evaluate_every=evaluate_every, | |||
metrics=metrics, | |||
output_mapping={"preds": "pred"}, | |||
n_epochs=n_epochs, | |||
callbacks=callbacks, | |||
deepspeed_kwargs={ | |||
"strategy": strategy, | |||
"config": config | |||
} | |||
) | |||
trainer.run() | |||
if __name__ == "__main__": | |||
device = [0,1] | |||
# device = [0,1,3] | |||
callbacks = [ | |||
# RecordMetricCallback(monitor="acc#acc", metric_threshold=0.0, larger_better=True), | |||
RichCallback(5), | |||
] | |||
config = None | |||
test_trainer_deepspeed( | |||
device=device, | |||
callbacks=callbacks, | |||
strategy="deepspeed", | |||
config=config, | |||
n_epochs=5, | |||
) |
@@ -0,0 +1,105 @@ | |||
""" | |||
这个文件测试多卡情况下使用 deepspeed ,且用户自己调用了 deepspeed.initialize 的情况:: | |||
>>> deepspeed _test_trainer_deepspeed_outside.py | |||
""" | |||
import os | |||
import sys | |||
sys.path.append("../../../") | |||
from dataclasses import dataclass | |||
from fastNLP.core.controllers.trainer import Trainer | |||
from fastNLP.core.metrics.accuracy import Accuracy | |||
from fastNLP.core.callbacks.progress_callback import RichCallback | |||
from fastNLP.core.drivers.torch_driver.utils import _create_default_config | |||
import deepspeed | |||
import torch | |||
from torch.optim import Adam | |||
from torch.utils.data import DataLoader | |||
from tests.helpers.models.torch_model import TorchNormalModel_Classification_2 | |||
from tests.helpers.datasets.torch_data import TorchArgMaxDataset | |||
local_rank = int(os.environ["LOCAL_RANK"]) | |||
@dataclass | |||
class TrainDeepSpeedConfig: | |||
num_labels: int = 3 | |||
feature_dimension: int = 3 | |||
batch_size: int = 2 | |||
shuffle: bool = True | |||
evaluate_every = 2 | |||
def test_trainer_deepspeed( | |||
device, | |||
callbacks, | |||
strategy, | |||
config, | |||
n_epochs=2, | |||
): | |||
model = TorchNormalModel_Classification_2( | |||
num_labels=TrainDeepSpeedConfig.num_labels, | |||
feature_dimension=TrainDeepSpeedConfig.feature_dimension | |||
) | |||
optimizers = Adam(params=model.parameters(), lr=0.0001) | |||
train_dataloader = DataLoader( | |||
dataset=TorchArgMaxDataset(TrainDeepSpeedConfig.feature_dimension, 20), | |||
batch_size=TrainDeepSpeedConfig.batch_size, | |||
shuffle=True | |||
) | |||
val_dataloader = DataLoader( | |||
dataset=TorchArgMaxDataset(TrainDeepSpeedConfig.feature_dimension, 12), | |||
batch_size=TrainDeepSpeedConfig.batch_size, | |||
shuffle=True | |||
) | |||
train_dataloader = train_dataloader | |||
evaluate_dataloaders = val_dataloader | |||
evaluate_every = TrainDeepSpeedConfig.evaluate_every | |||
metrics = {"acc": Accuracy()} | |||
if config is not None: | |||
config["train_micro_batch_size_per_gpu"] = TrainDeepSpeedConfig.batch_size | |||
model, optimizers, _, _ = deepspeed.initialize( | |||
model=model, | |||
optimizer=optimizers, | |||
config=config, | |||
) | |||
trainer = Trainer( | |||
model=model, | |||
driver="torch", | |||
device=device, | |||
data_device=torch.device(f"cuda:{local_rank}"), | |||
optimizers=optimizers, | |||
train_dataloader=train_dataloader, | |||
evaluate_dataloaders=evaluate_dataloaders, | |||
evaluate_every=evaluate_every, | |||
metrics=metrics, | |||
output_mapping={"preds": "pred"}, | |||
n_epochs=n_epochs, | |||
callbacks=callbacks, | |||
deepspeed_kwargs={ | |||
"strategy": strategy, | |||
"config": config | |||
} | |||
) | |||
trainer.run() | |||
if __name__ == "__main__": | |||
device = [0,1] | |||
# device = [0,1,3] | |||
callbacks = [ | |||
# RecordMetricCallback(monitor="acc#acc", metric_threshold=0.0, larger_better=True), | |||
RichCallback(5), | |||
] | |||
config = _create_default_config(stage=2) | |||
test_trainer_deepspeed( | |||
device=device, | |||
callbacks=callbacks, | |||
strategy="deepspeed", | |||
config=config, | |||
n_epochs=5, | |||
) |
@@ -0,0 +1,99 @@ | |||
import pytest | |||
from dataclasses import dataclass | |||
from fastNLP.core.controllers.trainer import Trainer | |||
from fastNLP.core.metrics.accuracy import Accuracy | |||
from fastNLP.core.callbacks.progress_callback import RichCallback | |||
from fastNLP.core.drivers.torch_driver import DeepSpeedDriver | |||
from fastNLP.core.drivers.torch_driver.utils import _create_default_config | |||
from fastNLP.envs.imports import _NEED_IMPORT_TORCH | |||
if _NEED_IMPORT_TORCH: | |||
import torch | |||
from torch.optim import Adam | |||
from torch.utils.data import DataLoader | |||
from tests.helpers.models.torch_model import TorchNormalModel_Classification_1 | |||
from tests.helpers.datasets.torch_data import TorchArgMaxDataset | |||
from tests.helpers.utils import magic_argv_env_context | |||
@dataclass | |||
class TrainDeepSpeedConfig: | |||
num_labels: int = 3 | |||
feature_dimension: int = 3 | |||
batch_size: int = 2 | |||
shuffle: bool = True | |||
evaluate_every = 2 | |||
@pytest.mark.deepspeed | |||
class TestTrainer: | |||
@classmethod | |||
def setup_class(cls): | |||
# 不初始化的话从第二个测试例开始会因为环境变量报错。 | |||
torch_model = TorchNormalModel_Classification_1(1, 1) | |||
torch_opt = torch.optim.Adam(params=torch_model.parameters(), lr=0.01) | |||
device = [torch.device(i) for i in [0,1]] | |||
driver = DeepSpeedDriver( | |||
model=torch_model, | |||
parallel_device=device, | |||
) | |||
driver.set_optimizers(torch_opt) | |||
driver.setup() | |||
return driver | |||
@pytest.mark.parametrize("device", [[0, 1]]) | |||
@pytest.mark.parametrize("callbacks", [[RichCallback(5)]]) | |||
@pytest.mark.parametrize("strategy", ["deepspeed", "deepspeed_stage_1"]) | |||
@pytest.mark.parametrize("config", [None, _create_default_config(stage=1)]) | |||
@magic_argv_env_context | |||
def test_trainer_deepspeed( | |||
self, | |||
device, | |||
callbacks, | |||
strategy, | |||
config, | |||
n_epochs=2, | |||
): | |||
model = TorchNormalModel_Classification_1( | |||
num_labels=TrainDeepSpeedConfig.num_labels, | |||
feature_dimension=TrainDeepSpeedConfig.feature_dimension | |||
) | |||
optimizers = Adam(params=model.parameters(), lr=0.0001) | |||
train_dataloader = DataLoader( | |||
dataset=TorchArgMaxDataset(TrainDeepSpeedConfig.feature_dimension, 20), | |||
batch_size=TrainDeepSpeedConfig.batch_size, | |||
shuffle=True | |||
) | |||
val_dataloader = DataLoader( | |||
dataset=TorchArgMaxDataset(TrainDeepSpeedConfig.feature_dimension, 12), | |||
batch_size=TrainDeepSpeedConfig.batch_size, | |||
shuffle=True | |||
) | |||
train_dataloader = train_dataloader | |||
evaluate_dataloaders = val_dataloader | |||
evaluate_every = TrainDeepSpeedConfig.evaluate_every | |||
metrics = {"acc": Accuracy()} | |||
if config is not None: | |||
config["train_micro_batch_size_per_gpu"] = TrainDeepSpeedConfig.batch_size | |||
trainer = Trainer( | |||
model=model, | |||
driver="torch", | |||
device=device, | |||
optimizers=optimizers, | |||
train_dataloader=train_dataloader, | |||
evaluate_dataloaders=evaluate_dataloaders, | |||
evaluate_every=evaluate_every, | |||
metrics=metrics, | |||
output_mapping={"preds": "pred"}, | |||
n_epochs=n_epochs, | |||
callbacks=callbacks, | |||
deepspeed_kwargs={ | |||
"strategy": strategy, | |||
"config": config | |||
} | |||
) | |||
trainer.run() |