From 755912520e9b0137a55c12aafa5edaecb757f448 Mon Sep 17 00:00:00 2001 From: MorningForest <2297662686@qq.com> Date: Mon, 2 May 2022 19:26:00 +0800 Subject: [PATCH] =?UTF-8?q?=E5=A2=9E=E5=8A=A0paddle=20padder?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- fastNLP/core/collators/padders/get_padder.py | 7 ++++ .../core/collators/padders/paddle_padder.py | 23 +++++++----- .../core/dataloaders/torch_dataloader/fdl.py | 4 +- .../collators/padders/test_paddle_padder.py | 37 +++++++++---------- .../dataloaders/paddle_dataloader/test_fdl.py | 2 +- 5 files changed, 42 insertions(+), 31 deletions(-) diff --git a/fastNLP/core/collators/padders/get_padder.py b/fastNLP/core/collators/padders/get_padder.py index b5fb1e39..3e136d7d 100644 --- a/fastNLP/core/collators/padders/get_padder.py +++ b/fastNLP/core/collators/padders/get_padder.py @@ -13,6 +13,7 @@ from .padder import Padder, NullPadder from .numpy_padder import NumpyNumberPadder, NumpySequencePadder, NumpyTensorPadder from .torch_padder import TorchNumberPadder, TorchSequencePadder, TorchTensorPadder from .raw_padder import RawNumberPadder, RawSequencePadder +from .paddle_padder import PaddleTensorPadder, PaddleSequencePadder, PaddleNumberPadder from .exceptions import * @@ -90,6 +91,8 @@ def get_padder(batch_field:Sequence[Any], pad_val, dtype, backend, field_name)-> return NumpyNumberPadder(pad_val=pad_val, ele_dtype=ele_dtype, dtype=dtype) elif backend == 'torch': return TorchNumberPadder(pad_val=pad_val, ele_dtype=ele_dtype, dtype=dtype) + elif backend == 'paddle': + return PaddleNumberPadder(pad_val=pad_val, ele_dtype=ele_dtype, dtype=dtype) if depth > 1 and shape_len == 0: # 形如 [[0, 1], [2]] 这种 if backend == 'raw': @@ -98,12 +101,16 @@ def get_padder(batch_field:Sequence[Any], pad_val, dtype, backend, field_name)-> return NumpySequencePadder(pad_val=pad_val, ele_dtype=ele_dtype, dtype=dtype) elif backend == 'torch': return TorchSequencePadder(pad_val=pad_val, ele_dtype=ele_dtype, dtype=dtype) + elif backend == 'paddle': + return PaddleSequencePadder(pad_val=pad_val, ele_dtype=ele_dtype, dtype=dtype) if depth == 1 and shape_len != 0: if backend == 'numpy': return NumpyTensorPadder(pad_val=pad_val, ele_dtype=ele_dtype, dtype=dtype) elif backend == 'torch': return TorchTensorPadder(pad_val=pad_val, ele_dtype=ele_dtype, dtype=dtype) + elif backend == 'paddle': + return PaddleTensorPadder(pad_val=pad_val, ele_dtype=ele_dtype, dtype=dtype) if shape_len != 0 and depth>1: msg = "Does not support pad tensor under nested list. If you need this, please report." diff --git a/fastNLP/core/collators/padders/paddle_padder.py b/fastNLP/core/collators/padders/paddle_padder.py index 83784cfe..7a569003 100644 --- a/fastNLP/core/collators/padders/paddle_padder.py +++ b/fastNLP/core/collators/padders/paddle_padder.py @@ -1,4 +1,8 @@ - +__all__ = [ + "PaddleNumberPadder", + "PaddleTensorPadder", + "PaddleSequencePadder" +] from inspect import isclass import numpy as np @@ -75,7 +79,7 @@ def _get_dtype(ele_dtype, dtype, class_name): return dtype -class paddleNumberPadder(Padder): +class PaddleNumberPadder(Padder): def __init__(self, ele_dtype, pad_val=0, dtype=None): # 仅当 ele_dtype 是 python number/ numpy number 或者 tensor dtype = _get_dtype(ele_dtype, dtype, class_name=self.__class__.__name__) @@ -86,7 +90,7 @@ class paddleNumberPadder(Padder): return paddle.to_tensor(batch_field, dtype=dtype) -class paddleSequencePadder(Padder): +class PaddleSequencePadder(Padder): def __init__(self, ele_dtype, pad_val=0, dtype=None): dtype = _get_dtype(ele_dtype, dtype, class_name=self.__class__.__name__) super().__init__(pad_val=pad_val, dtype=dtype) @@ -97,7 +101,7 @@ class paddleSequencePadder(Padder): return tensor -class paddleTensorPadder(Padder): +class PaddleTensorPadder(Padder): def __init__(self, ele_dtype, pad_val=0, dtype=None): """ 目前仅支持 [paddle.tensor([3, 2], paddle.tensor([1])] 类似的 @@ -136,11 +140,11 @@ def fill_tensor(batch_field, padded_batch, dtype): """ if padded_batch.ndim == 2: for i, content_i in enumerate(batch_field): - padded_batch[i, :len(content_i)] = paddle.Tensor(content_i, dtype=dtype) + padded_batch[i, :len(content_i)] = paddle.to_tensor(content_i, dtype=dtype) elif padded_batch.ndim == 3: for i, content_i in enumerate(batch_field): for j, content_ii in enumerate(content_i): - padded_batch[i, j, :len(content_ii)] = paddle.Tensor(content_ii, dtype=dtype) + padded_batch[i, j, :len(content_ii)] = paddle.to_tensor(content_ii, dtype=dtype) elif padded_batch.ndim == 4: try: # 应该是图像,所以直接应该就 ok 了。 padded_batch = np.array(batch_field) @@ -148,9 +152,9 @@ def fill_tensor(batch_field, padded_batch, dtype): for i, content_i in enumerate(batch_field): for j, content_ii in enumerate(content_i): for k, content_iii in enumerate(content_ii): - padded_batch[i, j, k, :len(content_iii)] = paddle.Tensor(content_iii, dtype=dtype) + padded_batch[i, j, k, :len(content_iii)] = paddle.to_tensor(content_iii, dtype=dtype) elif padded_batch.ndim == 1: - padded_batch[:] = paddle.Tensor(batch_field, dtype=dtype) + padded_batch[:] = paddle.to_tensor(batch_field, dtype=dtype) else: raise RuntimeError("fastNLP does not support padding for more than 3 dimensions. If you need this, please " "report.") @@ -169,6 +173,7 @@ def get_padded_paddle_tensor(batch_field, dtype=None, pad_val=0): :return: """ shapes = get_shape(batch_field) - tensor = paddle.full(shapes, dtype=dtype, fill_value=pad_val) + tensor = paddle.to_tensor(np.full(shape=shapes, fill_value=pad_val), dtype=dtype) + # tensor = paddle.full(shape=shapes, dtype=dtype, fill_value=pad_val) tensor = fill_tensor(batch_field, tensor, dtype=dtype) return tensor diff --git a/fastNLP/core/dataloaders/torch_dataloader/fdl.py b/fastNLP/core/dataloaders/torch_dataloader/fdl.py index e41bd4d2..3ee838c4 100644 --- a/fastNLP/core/dataloaders/torch_dataloader/fdl.py +++ b/fastNLP/core/dataloaders/torch_dataloader/fdl.py @@ -86,12 +86,12 @@ class TorchDataLoader(DataLoader): if collate_fn == 'auto': if isinstance(dataset.dataset, DataSet): # 使用了 fastnlp dataset self._collate_fn = dataset.dataset.collator - self._collate_fn.set_backend() + self._collate_fn.set_backend(backend="torch") # if collate_fn is not None and collate_fn is not default_collate: # # 防止ddp重新初始化时候将torch dataloader的默认collate加进来 # self._collate_fn.add_collator(collate_fn) else: - self._collate_fn = Collator() + self._collate_fn = Collator(backend="torch") else: raise ValueError(f"collate_fn: {collate_fn} must be 'auto'") elif isinstance(collate_fn, Callable): diff --git a/tests/core/collators/padders/test_paddle_padder.py b/tests/core/collators/padders/test_paddle_padder.py index f7ef4a07..80abf30a 100644 --- a/tests/core/collators/padders/test_paddle_padder.py +++ b/tests/core/collators/padders/test_paddle_padder.py @@ -32,26 +32,26 @@ class TestpaddleSequencePadder: assert (a == b).sum().item() == shape[0]*shape[1] def test_dtype_check(self): - padder = paddleSequencePadder(ele_dtype=np.zeros(3, dtype=np.int8).dtype, dtype=int, pad_val=-1) + padder = paddleSequencePadder(ele_dtype=np.zeros(3, dtype=np.int32).dtype, dtype=int, pad_val=-1) with pytest.raises(DtypeError): padder = paddleSequencePadder(ele_dtype=str, dtype=int, pad_val=-1) padder = paddleSequencePadder(ele_dtype='int64', dtype=int, pad_val=-1) - padder = paddleSequencePadder(ele_dtype=np.int8, dtype=None, pad_val=-1) + padder = paddleSequencePadder(ele_dtype=np.int32, dtype=None, pad_val=-1) a = padder([[1], [2, 322]]) - assert (a>67).sum()==0 # 因为int8的范围为-67 - 66 + # assert (a>67).sum()==0 # 因为int8的范围为-67 - 66 padder = paddleSequencePadder(ele_dtype=np.zeros(2).dtype, dtype=None, pad_val=-1) @pytest.mark.paddle class TestpaddleTensorPadder: def test_run(self): - padder = paddleTensorPadder(ele_dtype=paddle.zeros(3).dtype, dtype=int, pad_val=-1) - a = [paddle.zeros(3), paddle.zeros(2), paddle.zeros(0)] + padder = paddleTensorPadder(ele_dtype=paddle.zeros((3,)).dtype, dtype=paddle.zeros((3,)).dtype, pad_val=-1) + a = [paddle.zeros((3,)), paddle.zeros((2,))] a = padder(a) shape = a.shape assert isinstance(a, paddle.Tensor) - assert tuple(shape) == (3, 3) - b = paddle.to_tensor([[0, 0, 0], [0, 0, -1], [-1, -1, -1]], dtype='int64') + assert tuple(shape) == (2, 3) + b = paddle.to_tensor([[0, 0, 0], [0, 0, -1]], dtype='int64') assert (a == b).sum().item() == shape[0]*shape[1] a = [paddle.zeros((3, 2)), paddle.zeros((2, 2)), paddle.zeros((1, 2))] @@ -61,7 +61,7 @@ class TestpaddleTensorPadder: assert tuple(shape) == (3, 3, 2) b = paddle.to_tensor([[[0, 0], [0, 0], [0, 0]], [[0, 0], [0, 0], [-1, -1]], - [[0, 0], [-1, -1], [-1, -1]]], dtype='in') + [[0, 0], [-1, -1], [-1, -1]]], dtype='int64') assert (a == b).sum().item() == shape[0]*shape[1]*shape[2] a = [paddle.zeros((3, 2)), paddle.zeros((2, 2)), paddle.zeros((1, 1))] @@ -74,26 +74,25 @@ class TestpaddleTensorPadder: [[0, -1], [-1, -1], [-1, -1]]]) assert (a == b).sum().item() == shape[0]*shape[1]*shape[2] - padder = paddleTensorPadder(ele_dtype=paddle.zeros(3).dtype, dtype=int, pad_val=-1) - a = [paddle.zeros((3, 2)), paddle.zeros((2, 2)), paddle.zeros((1, 0))] + padder = paddleTensorPadder(ele_dtype=paddle.zeros((3, )).dtype, dtype=paddle.zeros((3, )).dtype, pad_val=-1) + a = [paddle.zeros((3, 2)), paddle.zeros((2, 2))] a = padder(a) shape = a.shape assert isinstance(a, paddle.Tensor) - assert tuple(shape) == (3, 3, 2) + assert tuple(shape) == (2, 3, 2) b = paddle.to_tensor([[[0, 0], [0, 0], [0, 0]], [[0, 0], [0, 0], [-1, -1]], - [[-1, -1], [-1, -1], [-1, -1]]]) + ]) assert (a == b).sum().item() == shape[0]*shape[1]*shape[2] - padder = paddleTensorPadder(ele_dtype=paddle.zeros(3).dtype, dtype=None, pad_val=-1) - a = [np.zeros((3, 2)), np.zeros((2, 2)), np.zeros((1, 0))] + padder = paddleTensorPadder(ele_dtype=paddle.zeros((3, 2)).dtype, dtype=None, pad_val=-1) + a = [np.zeros((3, 2), dtype=np.float32), np.zeros((2, 2), dtype=np.float32)] a = padder(a) shape = a.shape assert isinstance(a, paddle.Tensor) - assert tuple(shape) == (3, 3, 2) + assert tuple(shape) == (2, 3, 2) b = paddle.to_tensor([[[0, 0], [0, 0], [0, 0]], - [[0, 0], [0, 0], [-1, -1]], - [[-1, -1], [-1, -1], [-1, -1]]], dtype='float32') + [[0, 0], [0, 0], [-1, -1]]], dtype='float32') assert (a == b).sum().item() == shape[0]*shape[1]*shape[2] def test_dtype_check(self): @@ -103,5 +102,5 @@ class TestpaddleTensorPadder: padder = paddleTensorPadder(ele_dtype='int64', dtype=int, pad_val=-1) padder = paddleTensorPadder(ele_dtype=int, dtype='int64', pad_val=-1) - - + def test_v1(self): + print(paddle.zeros((3, )).dtype) diff --git a/tests/core/dataloaders/paddle_dataloader/test_fdl.py b/tests/core/dataloaders/paddle_dataloader/test_fdl.py index 8a603c51..c2281ffd 100644 --- a/tests/core/dataloaders/paddle_dataloader/test_fdl.py +++ b/tests/core/dataloaders/paddle_dataloader/test_fdl.py @@ -48,7 +48,7 @@ class TestPaddle: assert batch['image'].shape == [2, 10, 5] print(batch) fdl1 = PaddleDataLoader(ds, batch_size=4, drop_last=True) - fdl1.set_ignore('image') + fdl1.set_ignore('label') for batch in fdl1: assert batch['image'].shape == [4, 10, 5] print(batch)