
paddle distributed training

tags/v1.0.0alpha
x54-729 2 years ago
parent
commit
680421fe86
5 changed files with 57 additions and 59 deletions
  1. fastNLP/core/drivers/paddle_driver/fleet.py (+23, -27)
  2. fastNLP/core/drivers/paddle_driver/initialize_paddle_driver.py (+6, -5)
  3. fastNLP/core/utils/paddle_utils.py (+2, -4)
  4. tests/helpers/datasets/paddle_data.py (+1, -21)
  5. tests/helpers/models/paddle_model.py (+25, -2)

fastNLP/core/drivers/paddle_driver/fleet.py (+23, -27)

@@ -8,7 +8,6 @@ from .utils import (
_FleetWrappingModel,
ForwardState,
_MODE_PARAMETER,
get_host_name_ip,
get_device_from_visible,
reset_seed,
)
@@ -81,9 +80,9 @@ class PaddleFleetDriver(PaddleDriver):


# if the user has already initialized the parallel model outside;
self.outside_fleet = False
# check paddle's distributed environment variables
if parallel_helper._is_parallel_ctx_initialized():
# if the user initialized DDP outside, we require the model passed in to be one already wrapped by DistributedDataParallel;
if parallel_helper._is_parallel_ctx_initialized() and FASTNLP_DISTRIBUTED_CHECK not in os.environ and \
"fastnlp_paddle_launch_not_fleet" not in os.environ:
# if the user initialized Fleet outside, we require the model passed in to be one already wrapped by DistributedDataParallel;
if not isinstance(model, DataParallel):
raise RuntimeError(
"It is not allowed to input a normal model instead of `paddle.DataParallel` when"
@@ -125,11 +124,11 @@ class PaddleFleetDriver(PaddleDriver):
self._test_step = partial(_running_fn_, step_fn=self.model, signature_fn=model.forward)


# when the parameter `device` is None and this parameter is not None, the corresponding data will be moved to the specified device;
self._data_device = kwargs.get("_data_device", None)
self._data_device = kwargs.get("data_device", None)
if self._data_device is not None:
if isinstance(self._data_device, int):
if self._data_device < 0:
raise ValueError("Parameter `_data_device` can not be smaller than 0.")
raise ValueError("Parameter `data_device` can not be smaller than 0.")
_could_use_device_num = paddle.device.cuda.device_count()
if self._data_device >= _could_use_device_num:
raise ValueError("The gpu device that parameter `device` specifies is not existed.")
@@ -140,18 +139,6 @@ class PaddleFleetDriver(PaddleDriver):
logger.warning("`Parameter data_device` is not equal to paddle.deivce.get_device(), " logger.warning("`Parameter data_device` is not equal to paddle.deivce.get_device(), "
"please keep them equal to avoid some potential bugs.") "please keep them equal to avoid some potential bugs.")


if not self.outside_fleet and parallel_device is None:
raise ValueError("Parameter `parallel_device` can not be None when using `PaddleFleetDriver`. This error is caused "
"when your value of parameter `device` is `None` in your `Trainer` instance.")

# may need to be moved into the parameters
self.strategy = kwargs.get("strategy", fleet.DistributedStrategy())
self.is_collective = kwargs.get("is_collective", True)
if not self.is_collective:
raise NotImplementedError("FastNLP dose not support `parameters server` for distributed training now.")
self.role_maker = kwargs.get("role_maker", None)

self._master_port = None
self.world_size = None
self.global_rank = 0
self._configured = False  # guard against calling configure_ddp() more than once
@@ -159,7 +146,11 @@ class PaddleFleetDriver(PaddleDriver):


self._fleet_kwargs = kwargs.get("paddle_fleet_kwargs", {})
check_user_specific_params(self._fleet_kwargs, DataParallel.__init__)
# TODO validate these parameters
self.strategy = self._fleet_kwargs.get("strategy", fleet.DistributedStrategy())
self.is_collective = self._fleet_kwargs.get("is_collective", True)
if not self.is_collective:
raise NotImplementedError("FastNLP only support `collective` for distributed training now.")
self.role_maker = self._fleet_kwargs.get("role_maker", None)


if self.local_rank == 0 and not is_in_paddle_dist():
# since the model is always initialized when a driver is used, the program occupies some GPU memory for the model from the very start, but this memory has not
@@ -193,14 +184,16 @@ class PaddleFleetDriver(PaddleDriver):
self.world_size = int(os.environ.get("PADDLE_TRAINERS_NUM"))
self.global_rank = int(os.environ.get("PADDLE_TRAINER_ID"))
reset_seed()
logger.warning(f"\nworld size, global rank: {self.world_size}, {self.global_rank}\n")
fleet.init(self.role_maker, self.is_collective, self.strategy)
logger.info(f"\nworld size, global rank: {self.world_size}, {self.global_rank}\n")
if not parallel_helper._is_parallel_ctx_initialized():
fleet.init(self.role_maker, self.is_collective, self.strategy)

os.environ["fastnlp_paddle_launch_not_fleet"] = "yes"


else:
# in the case where the user only uses a single distributed trainer
# here parallel_helper._is_parallel_ctx_initialized() must be False
# parallel_device is a list,
# if self.local_rank == 0 and FASTNLP_DISTRIBUTED_CHECK not in os.environ:
if not parallel_helper._is_parallel_ctx_initialized():
# the distributed environment has not been initialized and this is the main process
self.init_fleet_and_set()
@@ -212,11 +205,15 @@ class PaddleFleetDriver(PaddleDriver):
if sorted(pre_gpus) != sorted(self.parallel_device):
raise RuntimeError("Notice you are using `PaddleFleetDriver` after one instantiated `PaddleFleetDriver`, it is not"
"allowed that your second `PaddleFleetDriver` has a new setting of parameters `parallel_device`.")
self.world_size = dist.get_world_size()
self.global_rank = dist.get_rank()


if not self.outside_fleet:
# self.model.to(self.model_device)
self.configure_fleet()


self.barrier()

# initialize self._pids so that every process can receive the send operation from rank0;
# TODO what happens if .to is not used?
self._pids = []
@@ -238,6 +235,7 @@ class PaddleFleetDriver(PaddleDriver):
""" """
if self.local_rank == 0: if self.local_rank == 0:
# 是 rank0 的话,则拉起其它子进程 # 是 rank0 的话,则拉起其它子进程
print("in launcher")
launcher = FleetLauncher(self.parallel_device, self.output_from_new_proc) launcher = FleetLauncher(self.parallel_device, self.output_from_new_proc)
launcher.launch() launcher.launch()
# 设置参数和初始化分布式环境 # 设置参数和初始化分布式环境
@@ -255,6 +253,7 @@ class PaddleFleetDriver(PaddleDriver):
When the user starts the script with `python -m paddle.distributed.launch xxx.py`, we need to
obtain the various attributes from the environment variables set by paddle
"""
print("set_from_env")
self.world_size = dist.get_world_size()
self.global_rank = dist.get_rank()


@@ -296,8 +295,6 @@ class PaddleFleetDriver(PaddleDriver):


@property
def model_device(self):
# I think the two devices here should return their real values; the conversion for CUDA_VISIBLE_DEVICES should be done in the corresponding `to` functions,
# otherwise it will confuse users
return self._model_device


@property
@@ -407,9 +404,8 @@ class PaddleFleetDriver(PaddleDriver):


def move_data_to_device(self, batch: 'paddle.Tensor'):
device = self.data_device
# because CUDA_VISIBLE_DEVICES is set, this may cause errors in subprocesses
if FASTNLP_DISTRIBUTED_CHECK in os.environ:
device = get_device_from_visible(device)
# because CUDA_VISIBLE_DEVICES is set, this may cause errors
device = get_device_from_visible(device)
return paddle_move_data_to_device(batch, device)


@staticmethod
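
Note (not part of the commit): the diff above moves the fleet options out of the driver's top-level keyword arguments and instead reads them from `paddle_fleet_kwargs`, and it renames `_data_device` to `data_device`. A minimal Python sketch of how a caller might now assemble these arguments follows; the commented-out driver construction mirrors the call seen in initialize_paddle_driver.py, and everything else (variable names, option values) is illustrative only.

from paddle.distributed import fleet

# options that PaddleFleetDriver now reads from self._fleet_kwargs
fleet_kwargs = {
    "strategy": fleet.DistributedStrategy(),  # default used when omitted
    "is_collective": True,                    # anything else raises NotImplementedError
    "role_maker": None,                       # defaults to None, as in the diff
}

driver_kwargs = {
    "paddle_fleet_kwargs": fleet_kwargs,
    "data_device": 0,                         # renamed from "_data_device" in this commit
}

# driver = PaddleFleetDriver(model, device, True, **driver_kwargs)
# (model and device are placeholders; the positional arguments follow the call
#  in initialize_paddle_driver.py shown below)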


fastNLP/core/drivers/paddle_driver/initialize_paddle_driver.py (+6, -5)

@@ -7,7 +7,7 @@ from .single_device import PaddleSingleDriver
from .fleet import PaddleFleetDriver


from fastNLP.envs.imports import _NEED_IMPORT_PADDLE
from fastNLP.envs.env import FASTNLP_DISTRIBUTED_CHECK
from fastNLP.core.utils import is_in_paddle_launch_dist
from fastNLP.core.log import logger


if _NEED_IMPORT_PADDLE:
@@ -26,13 +26,14 @@ def initialize_paddle_driver(driver: str, device: Optional[Union[str, int, List[
:return: returns a tuple; the first element is the concrete pytorch-based `Driver` instance, and the second element is the name of that driver (used to check
that successive drivers in one script are used in the correct order);
"""
if "PADDLE_TRAINERS_NUM" in os.environ and "PADDLE_RANK_IN_NODE" in os.environ and FASTNLP_DISTRIBUTED_CHECK not in os.environ:
if is_in_paddle_launch_dist():
if device is not None:
logger.warning("Parameter `device` would be ignored when you are using `paddle.distributed.launch` to pull "
"up your script. And we will directly get the local device via "
"`f'gpu:{os.environ['FLAGS_selected_gpus']}')`.")
device = [int(g) for g in os.environ["FLAGS_selected_gpus"].split(",")]
return PaddleFleetDriver(model, f"gpu:{os.environ['PADDLE_RANK_IN_NODE']}", True, **kwargs)
"and `os.environ['CUDA_VISIBLE_DEVICES']``.")
device = [int(g) for g in os.environ["CUDA_VISIBLE_DEVICES"].split(",")]
# TODO currently each process corresponds to exactly one card, so pass a single int for now
return PaddleFleetDriver(model, device[0], True, **kwargs)


if driver not in {"paddle", "fleet"}: if driver not in {"paddle", "fleet"}:
raise ValueError("Parameter `driver` can only be one of these values: ['paddle', 'fleet'].") raise ValueError("Parameter `driver` can only be one of these values: ['paddle', 'fleet'].")


fastNLP/core/utils/paddle_utils.py (+2, -4)

@@ -13,7 +13,7 @@ import re
from typing import Any, Optional, Union


from fastNLP.envs.imports import _NEED_IMPORT_PADDLE
from fastNLP.envs import FASTNLP_DISTRIBUTED_CHECK
from fastNLP.envs import FASTNLP_DISTRIBUTED_CHECK, FASTNLP_BACKEND_LAUNCH


if _NEED_IMPORT_PADDLE:
import paddle
@@ -94,6 +94,4 @@ def is_in_paddle_launch_dist():
""" """
判断是否处于 launch 启动的分布式进程中 判断是否处于 launch 启动的分布式进程中
""" """
return 'PADDLE_RANK_IN_NODE' in os.environ and \
'FLAGS_selected_gpus' in os.environ and \
FASTNLP_DISTRIBUTED_CHECK not in os.environ
return FASTNLP_BACKEND_LAUNCH in os.environ
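
Note (not part of the commit): `is_in_paddle_launch_dist()` is simplified from a three-condition heuristic to a single marker check. A sketch comparing the two follows; the constant names are taken from the diff, and their string values are assumed to equal the names themselves, which may not be exact:

import os

FASTNLP_DISTRIBUTED_CHECK = "FASTNLP_DISTRIBUTED_CHECK"  # assumed value
FASTNLP_BACKEND_LAUNCH = "FASTNLP_BACKEND_LAUNCH"        # assumed value

def is_in_paddle_launch_dist_old() -> bool:
    # previous heuristic: rely on variables set by paddle.distributed.launch
    return ("PADDLE_RANK_IN_NODE" in os.environ
            and "FLAGS_selected_gpus" in os.environ
            and FASTNLP_DISTRIBUTED_CHECK not in os.environ)

def is_in_paddle_launch_dist_new() -> bool:
    # new check: a single marker set on launcher-started processes
    return FASTNLP_BACKEND_LAUNCH in os.environ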

tests/helpers/datasets/paddle_data.py (+1, -21)

@@ -15,7 +15,7 @@ class PaddleNormalDataset(Dataset):
return self._data[item]




class PaddleRandomDataset(Dataset):
class PaddleRandomMaxDataset(Dataset):
def __init__(self, num_samples, num_features):
self.x = paddle.randn((num_samples, num_features))
self.y = self.x.argmax(axis=-1)
@@ -25,23 +25,3 @@ class PaddleRandomDataset(Dataset):


def __getitem__(self, item):
return {"x": self.x[item], "y": self.y[item]}


class PaddleDataset_MNIST(Dataset):
def __init__(self, mode="train"):

self.dataset = [
(
np.array(img).astype('float32').reshape(-1),
label
) for img, label in paddle.vision.datasets.MNIST(mode=mode)
]

def __getitem__(self, idx):
return {"x": self.dataset[idx][0], "y": self.dataset[idx][1]}

def __len__(self):
return len(self.dataset)
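
Note (not part of the commit): `PaddleRandomDataset` is renamed to `PaddleRandomMaxDataset` (each label is the argmax of its random feature vector) and the MNIST helper is dropped. A hedged usage sketch of the renamed helper; the import path mirrors the file above, and the batch layout assumes paddle's default dict collation:

from paddle.io import DataLoader

from tests.helpers.datasets.paddle_data import PaddleRandomMaxDataset

dataset = PaddleRandomMaxDataset(num_samples=320, num_features=10)
loader = DataLoader(dataset, batch_size=32, shuffle=True)

for batch in loader:
    # "x" holds the random features, "y" the index of the largest feature per sample
    print(batch["x"].shape, batch["y"].shape)
    break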




tests/helpers/models/paddle_model.py (+25, -2)

@@ -1,12 +1,12 @@
import paddle
import paddle.nn as nn


class PaddleNormalModel_Classification(paddle.nn.Layer):
class PaddleNormalModel_Classification_1(paddle.nn.Layer):
""" """
基础的paddle分类模型 基础的paddle分类模型
""" """
def __init__(self, num_labels, feature_dimension): def __init__(self, num_labels, feature_dimension):
super(PaddleNormalModel_Classification, self).__init__()
super(PaddleNormalModel_Classification_1, self).__init__()
self.num_labels = num_labels


self.linear1 = nn.Linear(in_features=feature_dimension, out_features=64)
@@ -30,3 +30,26 @@ class PaddleNormalModel_Classification(paddle.nn.Layer):


x = self(x)
return {"pred": x, "target": y.reshape((-1,))}


class PaddleNormalModel_Classification_2(paddle.nn.Layer):
"""
A basic paddle classification model that only implements forward, used to test the scenario where the user initializes the distributed setup themselves
"""
def __init__(self, num_labels, feature_dimension):
super(PaddleNormalModel_Classification_2, self).__init__()
self.num_labels = num_labels

self.linear1 = nn.Linear(in_features=feature_dimension, out_features=64)
self.ac1 = nn.ReLU()
self.linear2 = nn.Linear(in_features=64, out_features=32)
self.ac2 = nn.ReLU()
self.output = nn.Linear(in_features=32, out_features=num_labels)
self.loss_fn = nn.CrossEntropyLoss()

def forward(self, x, y):
x = self.ac1(self.linear1(x))
x = self.ac2(self.linear2(x))
x = self.output(x)
loss = self.loss_fn(x, y)
return {"loss": loss, "pred": x, "target": y.reshape((-1,))}
