1. 在 PaddleBackend 中添加 is_distributed 函数 2. 在 paddle 多卡的主进程添加 FASTNLP_GLOBAL_RANK

3 years ago · 06743caa1a
--- a/fastNLP/core/drivers/paddle_driver/fleet.py
+++ b/fastNLP/core/drivers/paddle_driver/fleet.py
@@ -207,12 +207,6 @@ class PaddleFleetDriver(PaddleDriver):
            raise NotImplementedError("FastNLP only support `collective` for distributed training now.")
        self.role_maker = self._fleet_kwargs.pop("role_maker", None)
        if self.local_rank == 0 and not is_in_paddle_dist():
            # 由于使用driver时模型一定会被初始化，因此在一开始程序一定会占用一部分显存来存放模型，然而这部分显存没有
            # 发挥任何作用。
            logger.warning(f"The program will use some extra space on {paddle.device.get_device()} to place your model since the model "
                            "has already been initialized.")
        self.output_from_new_proc = kwargs.get("output_from_new_proc", "only_error")
        assert isinstance(self.output_from_new_proc, str), "Parameter `output_from_new_proc` can only be `str` type."
        if self.output_from_new_proc not in {"all", "ignore", "only_error"}:
--- a/fastNLP/core/drivers/paddle_driver/fleet_launcher.py
+++ b/fastNLP/core/drivers/paddle_driver/fleet_launcher.py
@@ -10,6 +10,7 @@ from fastNLP.envs.env import (
    FASTNLP_DISTRIBUTED_CHECK,
    FASTNLP_LOG_LEVEL,
    FASTNLP_GLOBAL_SEED,
    FASTNLP_GLOBAL_RANK
 )
 from fastNLP.core.utils import get_paddle_device_id
 from .utils import (
@@ -130,6 +131,8 @@ class FleetLauncher:
        """
        global_envs = copy.copy(os.environ.copy())
        os.environ[FASTNLP_GLOBAL_RANK] = "0"
        self.gloo_rendezvous_dir = tempfile.mkdtemp()
        # launch中涉及的gloo环境
        global_envs["PADDLE_WITH_GLOO"] = str(os.getenv("PADDLE_WITH_GLOO", "0"))
--- a/fastNLP/core/metrics/backend/jittor_backend/backend.py
+++ b/fastNLP/core/metrics/backend/jittor_backend/backend.py
@@ -6,6 +6,7 @@ from fastNLP.core.metrics.backend import Backend
 if _NEED_IMPORT_JITTOR:
    import jittor
 __all__ = []
 class JittorBackend(Backend):
--- a/fastNLP/core/metrics/backend/paddle_backend/backend.py
+++ b/fastNLP/core/metrics/backend/paddle_backend/backend.py
@@ -4,23 +4,17 @@ from typing import List, Any
 import numpy as np
 from fastNLP.core.metrics.backend import Backend
 from fastNLP.core.utils.paddle_utils import paddle_to, _convert_data_device
 from fastNLP.core.utils.paddle_utils import paddle_to, _convert_data_device, is_in_paddle_dist
 from fastNLP.core.metrics.utils import AggregateMethodError
 from fastNLP.core.drivers.paddle_driver.dist_utils import fastnlp_paddle_all_gather
 from fastNLP.envs.imports import _NEED_IMPORT_PADDLE
 from fastNLP.envs.env import USER_CUDA_VISIBLE_DEVICES
 if _NEED_IMPORT_PADDLE:
    import paddle
    import paddle.distributed as dist
    from paddle.fluid.dygraph import parallel_helper
 def _simple_gather_all_tensors(result, group: Any, world_size: int) -> List:
    gathered_result = [paddle.zeros_like(result) for _ in range(world_size)]
    dist.all_gather(gathered_result, result, group)
    return gathered_result
 __all__ = []
 class PaddleBackend(Backend):
    def __init__(self):
@@ -80,6 +74,13 @@ class PaddleBackend(Backend):
        else:
            raise ValueError(f"tensor: {tensor} can not convert to ndarray!")
    @staticmethod
    def is_distributed() -> bool:
        """
        :return:
        """
        return is_in_paddle_dist()
    def move_tensor_to_device(self, tensor, device):
        device = _convert_data_device(device)
        return paddle_to(tensor, device)
--- a/fastNLP/core/metrics/backend/torch_backend/backend.py
+++ b/fastNLP/core/metrics/backend/torch_backend/backend.py
@@ -12,12 +12,7 @@ if _NEED_IMPORT_TORCH:
    import torch
    import torch.distributed as dist
 def _simple_gather_all_tensors(result, group: Any, world_size: int) -> List:
    gathered_result = [torch.zeros_like(result) for _ in range(world_size)]
    dist.all_gather(gathered_result, result, group)
    return gathered_result
 __all__ = []
 class TorchBackend(Backend):
    def __init__(self):