From 696f6b89f830baffefbeea1c71d1c354a057090e Mon Sep 17 00:00:00 2001 From: yh Date: Wed, 18 May 2022 21:49:47 +0800 Subject: [PATCH] =?UTF-8?q?=E4=BF=AE=E6=94=B9=E5=8F=82=E6=95=B0train=5Fbat?= =?UTF-8?q?ch=5Fsize?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- .../core/dataloaders/jittor_dataloader/fdl.py | 21 ++++++++++--------- .../core/dataloaders/paddle_dataloader/fdl.py | 21 ++++++++++--------- .../core/dataloaders/torch_dataloader/fdl.py | 21 ++++++++++--------- 3 files changed, 33 insertions(+), 30 deletions(-) diff --git a/fastNLP/core/dataloaders/jittor_dataloader/fdl.py b/fastNLP/core/dataloaders/jittor_dataloader/fdl.py index 0705866d..cee3cf3d 100644 --- a/fastNLP/core/dataloaders/jittor_dataloader/fdl.py +++ b/fastNLP/core/dataloaders/jittor_dataloader/fdl.py @@ -185,7 +185,7 @@ class JittorDataLoader: return self.cur_batch_indices -def prepare_jittor_dataloader(ds_or_db, train_batch_size: int = 16, shuffle: bool = True, +def prepare_jittor_dataloader(ds_or_db, batch_size: int = 16, shuffle: bool = True, drop_last: bool = False, num_workers: int = 0, buffer_size: int = 512 * 1024 * 1024, stop_grad: bool = True, keep_numpy_array: bool = False, endless: bool = False, collate_fn: Union[None, str, Callable] = "auto", @@ -211,8 +211,9 @@ def prepare_jittor_dataloader(ds_or_db, train_batch_size: int = 16, shuffle: boo :param ds_or_db: 传进来的dataset集合或字典或为dataset或DataBundle。其取值只能为``[Dataset, DataBundle, Sequence[Dataset], Dict[name, Dataset]]``. - :param train_batch_size: 'train'数据集使用的batch_size,跟non_train_batch_size是互斥的。 - :param non_train_batch_size: 非'train'数据使用batch_size,跟train_batch_size是互斥的。 + :param batch_size: batch 的大小。 + :param non_train_batch_size: 如果传入的 ``ds_or_db`` 为 ``Dict`` 或 :class:`~fastNLP.io.DataBundle` 对象,可以通过改参数 + 设置名称不为 `train` 的其他 ``dataset`` 的 ``batch_size``。 :param shuffle: 是否打乱数据集 :param drop_last: 是否去掉最后一个不符合``batch_size``的数据 :param num_workers: 进程的数量,当``num_workers=0``时不开启多进程 @@ -234,7 +235,7 @@ def prepare_jittor_dataloader(ds_or_db, train_batch_size: int = 16, shuffle: boo """ from fastNLP.io.data_bundle import DataBundle if isinstance(ds_or_db, Dataset): - dl = JittorDataLoader(ds_or_db, batch_size=train_batch_size, shuffle=shuffle, + dl = JittorDataLoader(ds_or_db, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last, num_workers=num_workers, buffer_size=buffer_size, stop_grad=stop_grad, keep_numpy_array=keep_numpy_array, endless=endless, collate_fn=collate_fn) @@ -243,7 +244,7 @@ def prepare_jittor_dataloader(ds_or_db, train_batch_size: int = 16, shuffle: boo dl_bundle = {} for name, ds in ds_or_db.iter_datasets(): if 'train' in name: - dl_bundle[name] = JittorDataLoader(ds_or_db, batch_size=train_batch_size, shuffle=shuffle, + dl_bundle[name] = JittorDataLoader(ds_or_db, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last, num_workers=num_workers, buffer_size=buffer_size, stop_grad=stop_grad, keep_numpy_array=keep_numpy_array, @@ -251,7 +252,7 @@ def prepare_jittor_dataloader(ds_or_db, train_batch_size: int = 16, shuffle: boo collate_fn=collate_fn) else: dl_bundle[name] = JittorDataLoader(ds_or_db, - batch_size=non_train_batch_size if non_train_batch_size else train_batch_size, + batch_size=non_train_batch_size if non_train_batch_size else batch_size, shuffle=shuffle, drop_last=drop_last, num_workers=num_workers, buffer_size=buffer_size, @@ -263,8 +264,8 @@ def prepare_jittor_dataloader(ds_or_db, train_batch_size: int = 16, shuffle: boo ds_seq = [] for idx, ds in enumerate(ds_or_db): if idx > 0: - train_batch_size = non_train_batch_size if non_train_batch_size else train_batch_size - dl = JittorDataLoader(ds, batch_size=train_batch_size, shuffle=shuffle, + batch_size = non_train_batch_size if non_train_batch_size else batch_size + dl = JittorDataLoader(ds, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last, num_workers=num_workers, buffer_size=buffer_size, stop_grad=stop_grad, keep_numpy_array=keep_numpy_array, endless=endless, collate_fn=collate_fn) @@ -275,13 +276,13 @@ def prepare_jittor_dataloader(ds_or_db, train_batch_size: int = 16, shuffle: boo ds_dict = {} for name, ds in ds_or_db.items(): if 'train' in name: - dl = JittorDataLoader(ds, batch_size=train_batch_size, shuffle=shuffle, + dl = JittorDataLoader(ds, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last, num_workers=num_workers, buffer_size=buffer_size, stop_grad=stop_grad, keep_numpy_array=keep_numpy_array, endless=endless, collate_fn=collate_fn) else: dl = JittorDataLoader(ds_or_db, - batch_size=non_train_batch_size if non_train_batch_size else train_batch_size, + batch_size=non_train_batch_size if non_train_batch_size else batch_size, shuffle=shuffle, drop_last=drop_last, num_workers=num_workers, buffer_size=buffer_size, diff --git a/fastNLP/core/dataloaders/paddle_dataloader/fdl.py b/fastNLP/core/dataloaders/paddle_dataloader/fdl.py index db7bc47e..d37f0ed7 100644 --- a/fastNLP/core/dataloaders/paddle_dataloader/fdl.py +++ b/fastNLP/core/dataloaders/paddle_dataloader/fdl.py @@ -253,7 +253,7 @@ class PaddleDataLoader(DataLoader): def prepare_paddle_dataloader(ds_or_db, feed_list=None, places=None, return_list: bool = True, batch_sampler: Union["Sampler[Sequence[int]]", ReproducibleBatchSampler] = None, - train_batch_size: int = 16, shuffle: bool = False, + batch_size: int = 16, shuffle: bool = False, drop_last: bool = False, collate_fn: Union[Callable, str, None] = 'auto', num_workers: int = 0, use_buffer_reader: bool = True, use_shared_memory: bool = True, timeout: int = 0, @@ -280,8 +280,9 @@ def prepare_paddle_dataloader(ds_or_db, feed_list=None, places=None, :param ds_or_db: 传进来的dataset集合或字典或为dataset或DataBundle。其取值只能为``[Dataset, DataBundle, Sequence[Dataset], Dict[name, Dataset]]``. - :param train_batch_size: 'train'数据集使用的batch_size,跟non_train_batch_size是互斥的。 - :param non_train_batch_size: 非'train'数据使用batch_size,跟train_batch_size是互斥的。 + :param batch_size: batch 的大小。 + :param non_train_batch_size: 如果传入的 ``ds_or_db`` 为 ``Dict`` 或 :class:`~fastNLP.io.DataBundle` 对象,可以通过改参数 + 设置名称不为 `train` 的其他 ``dataset`` 的 ``batch_size``。 :param feed_list: (list(Tensor)|tuple(Tensor)): feed Tensor list. The Tensors should be created by :code:`paddle.static.data()`. :attr:`feed_list` must be set if :attr:`return_list` is @@ -327,7 +328,7 @@ def prepare_paddle_dataloader(ds_or_db, feed_list=None, places=None, from fastNLP.io.data_bundle import DataBundle if isinstance(ds_or_db, Dataset): dl = PaddleDataLoader(ds_or_db, feed_list=feed_list, places=places, return_list=return_list, - batch_sampler=batch_sampler, batch_size=train_batch_size, shuffle=shuffle, + batch_sampler=batch_sampler, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last, collate_fn=collate_fn, num_workers=num_workers, use_shared_memory=use_shared_memory, use_buffer_reader=use_buffer_reader, timeout=timeout, worker_init_fn=worker_init_fn, persistent_workers=persistent_workers) @@ -338,7 +339,7 @@ def prepare_paddle_dataloader(ds_or_db, feed_list=None, places=None, if 'train' in name: dl_bundle[name] = PaddleDataLoader(ds, feed_list=feed_list, places=places, return_list=return_list, - batch_sampler=batch_sampler, batch_size=train_batch_size, + batch_sampler=batch_sampler, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last, collate_fn=collate_fn, num_workers=num_workers, use_shared_memory=use_shared_memory, @@ -349,7 +350,7 @@ def prepare_paddle_dataloader(ds_or_db, feed_list=None, places=None, dl_bundle[name] = PaddleDataLoader(ds, feed_list=feed_list, places=places, return_list=return_list, batch_sampler=batch_sampler, - batch_size=non_train_batch_size if non_train_batch_size else train_batch_size, + batch_size=non_train_batch_size if non_train_batch_size else batch_size, shuffle=shuffle, drop_last=drop_last, collate_fn=collate_fn, num_workers=num_workers, use_shared_memory=use_shared_memory, @@ -361,9 +362,9 @@ def prepare_paddle_dataloader(ds_or_db, feed_list=None, places=None, ds_seq = [] for idx, ds in enumerate(ds_or_db): if idx > 0: - train_batch_size = non_train_batch_size if non_train_batch_size else train_batch_size + batch_size = non_train_batch_size if non_train_batch_size else batch_size dl = PaddleDataLoader(ds, feed_list=feed_list, places=places, return_list=return_list, - batch_sampler=batch_sampler, batch_size=train_batch_size, shuffle=shuffle, + batch_sampler=batch_sampler, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last, collate_fn=collate_fn, num_workers=num_workers, use_shared_memory=use_shared_memory, use_buffer_reader=use_buffer_reader, timeout=timeout, worker_init_fn=worker_init_fn, persistent_workers=persistent_workers) @@ -375,7 +376,7 @@ def prepare_paddle_dataloader(ds_or_db, feed_list=None, places=None, for name, ds in ds_or_db.items(): if 'train' in name: dl = PaddleDataLoader(ds, feed_list=feed_list, places=places, return_list=return_list, - batch_sampler=batch_sampler, batch_size=train_batch_size, shuffle=shuffle, + batch_sampler=batch_sampler, batch_size=batch_size, shuffle=shuffle, drop_last=drop_last, collate_fn=collate_fn, num_workers=num_workers, use_shared_memory=use_shared_memory, use_buffer_reader=use_buffer_reader, timeout=timeout, worker_init_fn=worker_init_fn, @@ -383,7 +384,7 @@ def prepare_paddle_dataloader(ds_or_db, feed_list=None, places=None, else: dl = PaddleDataLoader(ds, feed_list=feed_list, places=places, return_list=return_list, batch_sampler=batch_sampler, - batch_size=non_train_batch_size if non_train_batch_size else train_batch_size, + batch_size=non_train_batch_size if non_train_batch_size else batch_size, shuffle=shuffle, drop_last=drop_last, collate_fn=collate_fn, num_workers=num_workers, use_shared_memory=use_shared_memory, use_buffer_reader=use_buffer_reader, diff --git a/fastNLP/core/dataloaders/torch_dataloader/fdl.py b/fastNLP/core/dataloaders/torch_dataloader/fdl.py index 7c54bed7..99faec7e 100644 --- a/fastNLP/core/dataloaders/torch_dataloader/fdl.py +++ b/fastNLP/core/dataloaders/torch_dataloader/fdl.py @@ -197,7 +197,7 @@ class TorchDataLoader(DataLoader): def prepare_torch_dataloader(ds_or_db, - train_batch_size: int = 16, + batch_size: int = 16, shuffle: bool = False, train_sampler: Union["Sampler[int]", ReproducibleSampler, UnrepeatedSampler] = None, batch_sampler: Union["Sampler[Sequence[int]]", ReproducibleBatchSampler] = None, @@ -229,8 +229,9 @@ def prepare_torch_dataloader(ds_or_db, :param ds_or_db: 传进来的dataset集合或字典或为dataset或DataBundle。其取值只能为``[Dataset, DataBundle, Sequence[Dataset], Dict[name, Dataset]]``. :param shuffle: 是否打乱数据集 - :param train_batch_size: 'train'数据集使用的batch_size,跟non_train_batch_size是互斥的。 - :param non_train_batch_size: 非'train'数据使用batch_size,跟train_batch_size是互斥的。 + :param batch_size: batch 的大小。 + :param non_train_batch_size: 如果传入的 ``ds_or_db`` 为 ``Dict`` 或 :class:`~fastNLP.io.DataBundle` 对象,可以通过改参数 + 设置名称不为 `train` 的其他 ``dataset`` 的 ``batch_size``。 :param train_sampler: train'数据集使用的sampler, 现了__len__和__iter__方法的实例化对象,其功能是每次返回dataset的一个index,当其不为None时,shuffle参数无效 :param non_train_sampler: 非'train'数据使用sampler, 实现了__len__和__iter__方法的实例化对象,其功能是每次返回dataset的一个index,当其不为None时,shuffle参数无效 :param batch_sampler: 实现了__len__和__iter__方法的实例化对象,,其能迭代返回一个list的index数据, index不超过dataset的大小, @@ -259,7 +260,7 @@ def prepare_torch_dataloader(ds_or_db, from fastNLP.io import DataBundle if isinstance(ds_or_db, DataSet): - dl = TorchDataLoader(dataset=ds_or_db, batch_size=train_batch_size, + dl = TorchDataLoader(dataset=ds_or_db, batch_size=batch_size, shuffle=shuffle, sampler=train_sampler, batch_sampler=batch_sampler, num_workers=num_workers, collate_fn=collate_fn, pin_memory=pin_memory, drop_last=drop_last, timeout=timeout, worker_init_fn=worker_init_fn, @@ -272,7 +273,7 @@ def prepare_torch_dataloader(ds_or_db, dl_bundle = {} for name, ds in ds_or_db.iter_datasets(): if 'train' in name: - dl_bundle[name] = TorchDataLoader(dataset=ds, batch_size=train_batch_size, + dl_bundle[name] = TorchDataLoader(dataset=ds, batch_size=batch_size, shuffle=shuffle, sampler=train_sampler, batch_sampler=batch_sampler, num_workers=num_workers, collate_fn=collate_fn, pin_memory=pin_memory, drop_last=drop_last, timeout=timeout, worker_init_fn=worker_init_fn, @@ -282,7 +283,7 @@ def prepare_torch_dataloader(ds_or_db, ) else: dl_bundle[name] = TorchDataLoader(dataset=ds, - batch_size=non_train_batch_size if non_train_batch_size else train_batch_size, + batch_size=non_train_batch_size if non_train_batch_size else batch_size, shuffle=shuffle, sampler=non_train_sampler if non_train_sampler else train_sampler, batch_sampler=batch_sampler, @@ -298,10 +299,10 @@ def prepare_torch_dataloader(ds_or_db, dl_bundle = [] for idx, ds in enumerate(ds_or_db): if idx > 0: - train_batch_size = non_train_batch_size if non_train_batch_size else train_batch_size + batch_size = non_train_batch_size if non_train_batch_size else batch_size train_sampler = non_train_sampler if non_train_sampler else train_sampler dl_bundle.append( - TorchDataLoader(dataset=ds, batch_size=train_batch_size, + TorchDataLoader(dataset=ds, batch_size=batch_size, shuffle=shuffle, sampler=train_sampler, batch_sampler=batch_sampler, num_workers=num_workers, collate_fn=collate_fn, pin_memory=pin_memory, drop_last=drop_last, timeout=timeout, worker_init_fn=worker_init_fn, @@ -315,7 +316,7 @@ def prepare_torch_dataloader(ds_or_db, dl_bundle = {} for name, ds in ds_or_db.items(): if 'train' in name: - dl_bundle[name] = TorchDataLoader(dataset=ds, batch_size=train_batch_size, + dl_bundle[name] = TorchDataLoader(dataset=ds, batch_size=batch_size, shuffle=shuffle, sampler=train_sampler, batch_sampler=batch_sampler, num_workers=num_workers, collate_fn=collate_fn, pin_memory=pin_memory, drop_last=drop_last, timeout=timeout, worker_init_fn=worker_init_fn, @@ -325,7 +326,7 @@ def prepare_torch_dataloader(ds_or_db, ) else: dl_bundle[name] = TorchDataLoader(dataset=ds, - batch_size=non_train_batch_size if non_train_batch_size else train_batch_size, + batch_size=non_train_batch_size if non_train_batch_size else batch_size, shuffle=shuffle, sampler=non_train_sampler if non_train_sampler else train_sampler, batch_sampler=batch_sampler,