diff --git a/fastNLP/core/drivers/paddle_driver/fleet.py b/fastNLP/core/drivers/paddle_driver/fleet.py
index 72dbd07d..f438599b 100644
--- a/fastNLP/core/drivers/paddle_driver/fleet.py
+++ b/fastNLP/core/drivers/paddle_driver/fleet.py
@@ -207,12 +207,6 @@ class PaddleFleetDriver(PaddleDriver):
             raise NotImplementedError("FastNLP only support `collective` for distributed training now.")
         self.role_maker = self._fleet_kwargs.pop("role_maker", None)
 
-        if self.local_rank == 0 and not is_in_paddle_dist():
-            # Since the model is always initialized when the driver is used, the program will occupy some GPU memory
-            # to hold it from the very beginning, even though that memory serves no purpose here.
-            logger.warning(f"The program will use some extra space on {paddle.device.get_device()} to place your model since the model "
-                           "has already been initialized.")
-
         self.output_from_new_proc = kwargs.get("output_from_new_proc", "only_error")
         assert isinstance(self.output_from_new_proc, str), "Parameter `output_from_new_proc` can only be `str` type."
         if self.output_from_new_proc not in {"all", "ignore", "only_error"}:
diff --git a/fastNLP/core/drivers/paddle_driver/fleet_launcher.py b/fastNLP/core/drivers/paddle_driver/fleet_launcher.py
index b53680cc..4df795ef 100644
--- a/fastNLP/core/drivers/paddle_driver/fleet_launcher.py
+++ b/fastNLP/core/drivers/paddle_driver/fleet_launcher.py
@@ -10,6 +10,7 @@ from fastNLP.envs.env import (
     FASTNLP_DISTRIBUTED_CHECK,
     FASTNLP_LOG_LEVEL,
     FASTNLP_GLOBAL_SEED,
+    FASTNLP_GLOBAL_RANK
 )
 from fastNLP.core.utils import get_paddle_device_id
 from .utils import (
@@ -130,6 +131,8 @@ class FleetLauncher:
         """
         global_envs = copy.copy(os.environ.copy())
 
+        os.environ[FASTNLP_GLOBAL_RANK] = "0"
+
         self.gloo_rendezvous_dir = tempfile.mkdtemp()
         # gloo environment variables involved in the launch
         global_envs["PADDLE_WITH_GLOO"] = str(os.getenv("PADDLE_WITH_GLOO", "0"))
diff --git a/fastNLP/core/metrics/backend/jittor_backend/backend.py b/fastNLP/core/metrics/backend/jittor_backend/backend.py
index 44831a57..98f6d8c0 100644
--- a/fastNLP/core/metrics/backend/jittor_backend/backend.py
+++ b/fastNLP/core/metrics/backend/jittor_backend/backend.py
@@ -6,6 +6,7 @@ from fastNLP.core.metrics.backend import Backend
 if _NEED_IMPORT_JITTOR:
     import jittor
 
 
+__all__ = []
 
 class JittorBackend(Backend):
diff --git a/fastNLP/core/metrics/backend/paddle_backend/backend.py b/fastNLP/core/metrics/backend/paddle_backend/backend.py
index b8ea9cb0..29a901a2 100644
--- a/fastNLP/core/metrics/backend/paddle_backend/backend.py
+++ b/fastNLP/core/metrics/backend/paddle_backend/backend.py
@@ -4,23 +4,17 @@ from typing import List, Any
 import numpy as np
 
 from fastNLP.core.metrics.backend import Backend
-from fastNLP.core.utils.paddle_utils import paddle_to, _convert_data_device
+from fastNLP.core.utils.paddle_utils import paddle_to, _convert_data_device, is_in_paddle_dist
 from fastNLP.core.metrics.utils import AggregateMethodError
 from fastNLP.core.drivers.paddle_driver.dist_utils import fastnlp_paddle_all_gather
 from fastNLP.envs.imports import _NEED_IMPORT_PADDLE
-from fastNLP.envs.env import USER_CUDA_VISIBLE_DEVICES
 
 if _NEED_IMPORT_PADDLE:
     import paddle
     import paddle.distributed as dist
     from paddle.fluid.dygraph import parallel_helper
 
-
-def _simple_gather_all_tensors(result, group: Any, world_size: int) -> List:
-    gathered_result = [paddle.zeros_like(result) for _ in range(world_size)]
-    dist.all_gather(gathered_result, result, group)
-    return gathered_result
-
+__all__ = []
 
 class PaddleBackend(Backend):
     def __init__(self):
@@ -80,6 +74,13 @@ class PaddleBackend(Backend):
         else:
             raise ValueError(f"tensor: {tensor} can not convert to ndarray!")
 
+    @staticmethod
+    def is_distributed() -> bool:
+        """
+        :return: whether the current process is running in the paddle distributed environment.
+        """
+        return is_in_paddle_dist()
+
     def move_tensor_to_device(self, tensor, device):
         device = _convert_data_device(device)
         return paddle_to(tensor, device)
diff --git a/fastNLP/core/metrics/backend/torch_backend/backend.py b/fastNLP/core/metrics/backend/torch_backend/backend.py
index a602434e..2badaa34 100644
--- a/fastNLP/core/metrics/backend/torch_backend/backend.py
+++ b/fastNLP/core/metrics/backend/torch_backend/backend.py
@@ -12,12 +12,7 @@ if _NEED_IMPORT_TORCH:
     import torch
     import torch.distributed as dist
 
-
-def _simple_gather_all_tensors(result, group: Any, world_size: int) -> List:
-    gathered_result = [torch.zeros_like(result) for _ in range(world_size)]
-    dist.all_gather(gathered_result, result, group)
-    return gathered_result
-
+__all__ = []
 
 class TorchBackend(Backend):
     def __init__(self):